amd/common: switch to 3-spaces style

Follow-up of !4319 using the same clang-format config.

Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5310>
This commit is contained in:
Pierre-Eric Pelloux-Prayer 2020-09-07 09:58:36 +02:00
parent 82d2d73e03
commit e5fb9dca2a
22 changed files with 7198 additions and 7379 deletions

View file

@ -1,3 +0,0 @@
[*.{c,h}]
indent_style = tab
indent_size = tab

View file

@ -21,132 +21,129 @@
* SOFTWARE.
*/
#include "ac_gpu_info.h"
#include "ac_binary.h"
#include "ac_gpu_info.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include <gelf.h>
#include <libelf.h>
#include <sid.h>
#include <stdio.h>
#include <sid.h>
#define SPILLED_SGPRS 0x4
#define SPILLED_VGPRS 0x8
#define SPILLED_SGPRS 0x4
#define SPILLED_VGPRS 0x8
/* Parse configuration data in .AMDGPU.config section format.
 *
 * The data is walked in 8-byte steps; each step reads two little-endian
 * 32-bit words, a register offset followed by its value, and the switch
 * copies the relevant fields of that value into conf. An unknown register
 * produces a warning on stderr, printed only once thanks to a function-local
 * static flag.
 *
 * After the loop:
 * - spi_ps_input_addr falls back to spi_ps_input_ena when it is still 0;
 * - scratch_bytes_per_wave is only written when really_needs_scratch is set;
 * - num_vgprs is aligned (16 for wave_size 32, otherwise 8) when
 *   info->chip_class >= GFX10_3;
 * - float_mode has the FP_ALL_DENORMS bits cleared and FP_64_DENORMS set.
 *
 * NOTE(review): num_vgprs, num_sgprs and lds_size are combined with MAX2
 * against their previous contents, so conf is presumably expected to be
 * initialized by the caller - confirm.
 *
 * NOTE(review): every iteration reads bytes i..i+7 while the loop only checks
 * i < nbytes, and data is cast straight to uint32_t *. This presumably relies
 * on nbytes being a multiple of 8 and on suitably aligned data - confirm
 * against the callers.
 *
 * NOTE(review): this listing interleaves the pre- and post-reformat lines of
 * the commit, which is why the signature and the statements appear twice.
 */
void ac_parse_shader_binary_config(const char *data, size_t nbytes,
unsigned wave_size,
bool really_needs_scratch,
const struct radeon_info *info,
struct ac_shader_config *conf)
void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size,
bool really_needs_scratch, const struct radeon_info *info,
struct ac_shader_config *conf)
{
uint32_t scratch_size = 0;
uint32_t scratch_size = 0;
for (size_t i = 0; i < nbytes; i += 8) {
unsigned reg = util_le32_to_cpu(*(uint32_t*)(data + i));
unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i + 4));
switch (reg) {
case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
case R_00B848_COMPUTE_PGM_RSRC1:
case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
if (wave_size == 32)
conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
else
conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
for (size_t i = 0; i < nbytes; i += 8) {
unsigned reg = util_le32_to_cpu(*(uint32_t *)(data + i));
unsigned value = util_le32_to_cpu(*(uint32_t *)(data + i + 4));
switch (reg) {
case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
case R_00B848_COMPUTE_PGM_RSRC1:
case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
if (wave_size == 32)
conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
else
conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
/* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
conf->float_mode = G_00B028_FLOAT_MODE(value);
conf->rsrc1 = value;
break;
case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
/* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B84C_COMPUTE_PGM_RSRC2:
conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
conf->rsrc2 = value;
break;
case R_00B8A0_COMPUTE_PGM_RSRC3:
conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
conf->rsrc3 = value;
break;
case R_0286CC_SPI_PS_INPUT_ENA:
conf->spi_ps_input_ena = value;
break;
case R_0286D0_SPI_PS_INPUT_ADDR:
conf->spi_ps_input_addr = value;
break;
case R_0286E8_SPI_TMPRING_SIZE:
case R_00B860_COMPUTE_TMPRING_SIZE:
/* WAVESIZE is in units of 256 dwords. */
scratch_size = value;
break;
case SPILLED_SGPRS:
conf->spilled_sgprs = value;
break;
case SPILLED_VGPRS:
conf->spilled_vgprs = value;
break;
default:
{
static bool printed;
conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
/* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
conf->float_mode = G_00B028_FLOAT_MODE(value);
conf->rsrc1 = value;
break;
case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
/* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
conf->rsrc2 = value;
break;
case R_00B84C_COMPUTE_PGM_RSRC2:
conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
conf->rsrc2 = value;
break;
case R_00B8A0_COMPUTE_PGM_RSRC3:
conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
conf->rsrc3 = value;
break;
case R_0286CC_SPI_PS_INPUT_ENA:
conf->spi_ps_input_ena = value;
break;
case R_0286D0_SPI_PS_INPUT_ADDR:
conf->spi_ps_input_addr = value;
break;
case R_0286E8_SPI_TMPRING_SIZE:
case R_00B860_COMPUTE_TMPRING_SIZE:
/* WAVESIZE is in units of 256 dwords. */
scratch_size = value;
break;
case SPILLED_SGPRS:
conf->spilled_sgprs = value;
break;
case SPILLED_VGPRS:
conf->spilled_vgprs = value;
break;
default: {
static bool printed;
if (!printed) {
fprintf(stderr, "Warning: LLVM emitted unknown "
"config register: 0x%x\n", reg);
printed = true;
}
}
break;
}
}
if (!printed) {
fprintf(stderr,
"Warning: LLVM emitted unknown "
"config register: 0x%x\n",
reg);
printed = true;
}
} break;
}
}
if (!conf->spi_ps_input_addr)
conf->spi_ps_input_addr = conf->spi_ps_input_ena;
if (!conf->spi_ps_input_addr)
conf->spi_ps_input_addr = conf->spi_ps_input_ena;
if (really_needs_scratch) {
/* sgprs spills aren't spilling */
conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
}
if (really_needs_scratch) {
/* sgprs spills aren't spilling */
conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
}
/* GFX 10.3 internally:
* - aligns VGPRS to 16 for Wave32 and 8 for Wave64
* - aligns LDS to 1024
*
* For shader-db stats, set num_vgprs that the hw actually uses.
*/
if (info->chip_class >= GFX10_3) {
conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
}
/* GFX 10.3 internally:
* - aligns VGPRS to 16 for Wave32 and 8 for Wave64
* - aligns LDS to 1024
*
* For shader-db stats, set num_vgprs that the hw actually uses.
*/
if (info->chip_class >= GFX10_3) {
conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
}
/* Enable 64-bit and 16-bit denormals, because there is no performance
* cost.
*
* Don't enable denormals for 32-bit floats, because:
* - denormals disable output modifiers
* - denormals break v_mad_f32
* - GFX6 & GFX7 would be very slow
*/
conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
conf->float_mode |= V_00B028_FP_64_DENORMS;
/* Enable 64-bit and 16-bit denormals, because there is no performance
* cost.
*
* Don't enable denormals for 32-bit floats, because:
* - denormals disable output modifiers
* - denormals break v_mad_f32
* - GFX6 & GFX7 would be very slow
*/
conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
conf->float_mode |= V_00B028_FP_64_DENORMS;
}

View file

@ -24,9 +24,9 @@
#ifndef AC_BINARY_H
#define AC_BINARY_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
@ -35,26 +35,24 @@ extern "C" {
struct radeon_info;
/* Shader configuration values. ac_parse_shader_binary_config() (declared
 * below) writes these fields through its conf argument.
 * The field list appears twice because this listing interleaves the old and
 * the reformatted lines of the commit.
 */
struct ac_shader_config {
unsigned num_sgprs;
unsigned num_vgprs;
unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */
unsigned spilled_sgprs;
unsigned spilled_vgprs;
unsigned lds_size; /* in HW allocation units; i.e. 256 bytes on SI, 512 bytes on CI+ */
unsigned spi_ps_input_ena;
unsigned spi_ps_input_addr;
unsigned float_mode;
unsigned scratch_bytes_per_wave;
unsigned rsrc1;
unsigned rsrc2;
unsigned rsrc3;
unsigned num_sgprs;
unsigned num_vgprs;
unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */
unsigned spilled_sgprs;
unsigned spilled_vgprs;
unsigned lds_size; /* in HW allocation units; i.e. 256 bytes on SI, 512 bytes on CI+ */
unsigned spi_ps_input_ena;
unsigned spi_ps_input_addr;
unsigned float_mode;
unsigned scratch_bytes_per_wave;
unsigned rsrc1;
unsigned rsrc2;
unsigned rsrc3;
};
void ac_parse_shader_binary_config(const char *data, size_t nbytes,
unsigned wave_size,
bool really_needs_scratch,
const struct radeon_info *info,
struct ac_shader_config *conf);
void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size,
bool really_needs_scratch, const struct radeon_info *info,
struct ac_shader_config *conf);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load diff

View file

@ -24,15 +24,15 @@
#ifndef AC_DEBUG_H
#define AC_DEBUG_H
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#include "amd_family.h"
#define AC_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id) & 0xffff))
#define AC_IS_TRACE_POINT(x) (((x) & 0xcafe0000) == 0xcafe0000)
#define AC_GET_TRACE_POINT_ID(x) ((x) & 0xffff)
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#define AC_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id)&0xffff))
#define AC_IS_TRACE_POINT(x) (((x)&0xcafe0000) == 0xcafe0000)
#define AC_GET_TRACE_POINT_ID(x) ((x)&0xffff)
#define AC_MAX_WAVES_PER_CHIP (64 * 40)
@ -41,36 +41,36 @@ extern "C" {
#endif
/* Record describing one wave: its location (se/sh/cu/simd/wave) plus status,
 * program counter, two instruction dwords and exec value.
 * ac_get_wave_info() (declared below) takes an array of
 * AC_MAX_WAVES_PER_CHIP of these.
 */
struct ac_wave_info {
unsigned se; /* shader engine */
unsigned sh; /* shader array */
unsigned cu; /* compute unit */
unsigned simd;
unsigned wave;
uint32_t status;
uint64_t pc; /* program counter */
uint32_t inst_dw0;
uint32_t inst_dw1;
uint64_t exec;
bool matched; /* whether the wave is used by a currently-bound shader */
unsigned se; /* shader engine */
unsigned sh; /* shader array */
unsigned cu; /* compute unit */
unsigned simd;
unsigned wave;
uint32_t status;
uint64_t pc; /* program counter */
uint32_t inst_dw0;
uint32_t inst_dw1;
uint64_t exec;
bool matched; /* whether the wave is used by a currently-bound shader */
};
typedef void *(*ac_debug_addr_callback)(void *data, uint64_t addr);
const char *ac_get_register_name(enum chip_class chip_class, unsigned offset);
void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset,
uint32_t value, uint32_t field_mask);
void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32_t value,
uint32_t field_mask);
void ac_parse_ib_chunk(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
unsigned trace_id_count, enum chip_class chip_class,
ac_debug_addr_callback addr_callback, void *addr_callback_data);
void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
unsigned trace_id_count, const char *name, enum chip_class chip_class,
ac_debug_addr_callback addr_callback, void *addr_callback_data);
unsigned trace_id_count, enum chip_class chip_class,
ac_debug_addr_callback addr_callback, void *addr_callback_data);
void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, unsigned trace_id_count,
const char *name, enum chip_class chip_class, ac_debug_addr_callback addr_callback,
void *addr_callback_data);
bool ac_vm_fault_occured(enum chip_class chip_class,
uint64_t *old_dmesg_timestamp, uint64_t *out_addr);
bool ac_vm_fault_occured(enum chip_class chip_class, uint64_t *old_dmesg_timestamp,
uint64_t *out_addr);
unsigned ac_get_wave_info(enum chip_class chip_class,
struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]);
struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]);
#ifdef __cplusplus
}

View file

@ -25,16 +25,17 @@
#ifndef AC_EXP_PARAM_H
#define AC_EXP_PARAM_H
/* AC_EXP_PARAM_* values: 0..31 are offsets, 64..67 are the four default
 * values (0000, 0001, 1110, 1111 in declaration order), and 255 marks an
 * undefined parameter.
 */
enum {
/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
AC_EXP_PARAM_OFFSET_0 = 0,
AC_EXP_PARAM_OFFSET_31 = 31,
/* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
AC_EXP_PARAM_DEFAULT_VAL_0001,
AC_EXP_PARAM_DEFAULT_VAL_1110,
AC_EXP_PARAM_DEFAULT_VAL_1111,
AC_EXP_PARAM_UNDEFINED = 255,
enum
{
/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
AC_EXP_PARAM_OFFSET_0 = 0,
AC_EXP_PARAM_OFFSET_31 = 31,
/* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
AC_EXP_PARAM_DEFAULT_VAL_0001,
AC_EXP_PARAM_DEFAULT_VAL_1110,
AC_EXP_PARAM_DEFAULT_VAL_1111,
AC_EXP_PARAM_UNDEFINED = 255,
};
#endif

File diff suppressed because it is too large Load diff

View file

@ -26,10 +26,11 @@
#ifndef AC_GPU_INFO_H
#define AC_GPU_INFO_H
#include "amd_family.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include "amd_family.h"
#ifdef __cplusplus
extern "C" {
@ -38,186 +39,179 @@ extern "C" {
struct amdgpu_gpu_info;
/* Device description, grouped by the section comments below: PCI location,
 * device identification, feature and hardware-bug flags, display features,
 * memory info, CP and multimedia firmware info, kernel/winsys capabilities,
 * shader-core and render-backend configuration, and tile mode tables.
 *
 * ac_query_gpu_info() (declared below) takes a pointer to this struct;
 * presumably it is what fills the fields in - confirm in ac_gpu_info.c.
 *
 * Each group appears twice because this listing interleaves the old and the
 * reformatted lines of the commit.
 */
struct radeon_info {
/* PCI info: domain:bus:dev:func */
uint32_t pci_domain;
uint32_t pci_bus;
uint32_t pci_dev;
uint32_t pci_func;
/* PCI info: domain:bus:dev:func */
uint32_t pci_domain;
uint32_t pci_bus;
uint32_t pci_dev;
uint32_t pci_func;
/* Device info. */
const char *name;
const char *marketing_name;
bool is_pro_graphics;
uint32_t pci_id;
uint32_t pci_rev_id;
enum radeon_family family;
enum chip_class chip_class;
uint32_t family_id;
uint32_t chip_external_rev;
uint32_t clock_crystal_freq;
/* Device info. */
const char *name;
const char *marketing_name;
bool is_pro_graphics;
uint32_t pci_id;
uint32_t pci_rev_id;
enum radeon_family family;
enum chip_class chip_class;
uint32_t family_id;
uint32_t chip_external_rev;
uint32_t clock_crystal_freq;
/* Features. */
bool has_graphics; /* false if the chip is compute-only */
uint32_t num_rings[NUM_RING_TYPES];
uint32_t ib_pad_dw_mask[NUM_RING_TYPES];
bool has_clear_state;
bool has_distributed_tess;
bool has_dcc_constant_encode;
bool has_rbplus; /* if RB+ registers exist */
bool rbplus_allowed; /* if RB+ is allowed */
bool has_load_ctx_reg_pkt;
bool has_out_of_order_rast;
bool has_packed_math_16bit;
bool cpdma_prefetch_writes_memory;
bool has_gfx9_scissor_bug;
bool has_tc_compat_zrange_bug;
bool has_msaa_sample_loc_bug;
bool has_ls_vgpr_init_bug;
/* Features. */
bool has_graphics; /* false if the chip is compute-only */
uint32_t num_rings[NUM_RING_TYPES];
uint32_t ib_pad_dw_mask[NUM_RING_TYPES];
bool has_clear_state;
bool has_distributed_tess;
bool has_dcc_constant_encode;
bool has_rbplus; /* if RB+ registers exist */
bool rbplus_allowed; /* if RB+ is allowed */
bool has_load_ctx_reg_pkt;
bool has_out_of_order_rast;
bool has_packed_math_16bit;
bool cpdma_prefetch_writes_memory;
bool has_gfx9_scissor_bug;
bool has_tc_compat_zrange_bug;
bool has_msaa_sample_loc_bug;
bool has_ls_vgpr_init_bug;
/* Display features. */
/* There are 2 display DCC codepaths, because display expects unaligned DCC. */
/* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
bool use_display_dcc_unaligned;
/* Allocate both aligned and unaligned DCC and use the retile blit. */
bool use_display_dcc_with_retile_blit;
/* Display features. */
/* There are 2 display DCC codepaths, because display expects unaligned DCC. */
/* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
bool use_display_dcc_unaligned;
/* Allocate both aligned and unaligned DCC and use the retile blit. */
bool use_display_dcc_with_retile_blit;
/* Memory info. */
uint32_t pte_fragment_size;
uint32_t gart_page_size;
uint64_t gart_size;
uint64_t vram_size;
uint64_t vram_vis_size;
uint32_t vram_bit_width;
uint32_t vram_type;
unsigned gds_size;
unsigned gds_gfx_partition_size;
uint64_t max_alloc_size;
uint32_t min_alloc_size;
uint32_t address32_hi;
bool has_dedicated_vram;
bool has_l2_uncached;
bool r600_has_virtual_memory;
uint32_t num_sdp_interfaces;
uint32_t num_tcc_blocks;
uint32_t tcc_cache_line_size;
bool tcc_harvested;
unsigned pc_lines;
uint32_t lds_size_per_workgroup;
uint32_t lds_granularity;
uint32_t max_memory_clock;
uint32_t ce_ram_size;
uint32_t l1_cache_size;
uint32_t l2_cache_size;
/* Memory info. */
uint32_t pte_fragment_size;
uint32_t gart_page_size;
uint64_t gart_size;
uint64_t vram_size;
uint64_t vram_vis_size;
uint32_t vram_bit_width;
uint32_t vram_type;
unsigned gds_size;
unsigned gds_gfx_partition_size;
uint64_t max_alloc_size;
uint32_t min_alloc_size;
uint32_t address32_hi;
bool has_dedicated_vram;
bool has_l2_uncached;
bool r600_has_virtual_memory;
uint32_t num_sdp_interfaces;
uint32_t num_tcc_blocks;
uint32_t tcc_cache_line_size;
bool tcc_harvested;
unsigned pc_lines;
uint32_t lds_size_per_workgroup;
uint32_t lds_granularity;
uint32_t max_memory_clock;
uint32_t ce_ram_size;
uint32_t l1_cache_size;
uint32_t l2_cache_size;
/* CP info. */
bool gfx_ib_pad_with_type2;
unsigned ib_alignment; /* both start and size alignment */
uint32_t me_fw_version;
uint32_t me_fw_feature;
uint32_t pfp_fw_version;
uint32_t pfp_fw_feature;
uint32_t ce_fw_version;
uint32_t ce_fw_feature;
/* CP info. */
bool gfx_ib_pad_with_type2;
unsigned ib_alignment; /* both start and size alignment */
uint32_t me_fw_version;
uint32_t me_fw_feature;
uint32_t pfp_fw_version;
uint32_t pfp_fw_feature;
uint32_t ce_fw_version;
uint32_t ce_fw_feature;
/* Multimedia info. */
bool has_hw_decode;
bool uvd_enc_supported;
uint32_t uvd_fw_version;
uint32_t vce_fw_version;
uint32_t vce_harvest_config;
/* Multimedia info. */
bool has_hw_decode;
bool uvd_enc_supported;
uint32_t uvd_fw_version;
uint32_t vce_fw_version;
uint32_t vce_harvest_config;
/* Kernel & winsys capabilities. */
uint32_t drm_major; /* version */
uint32_t drm_minor;
uint32_t drm_patchlevel;
bool is_amdgpu;
bool has_userptr;
bool has_syncobj;
bool has_syncobj_wait_for_submit;
bool has_timeline_syncobj;
bool has_fence_to_handle;
bool has_ctx_priority;
bool has_local_buffers;
bool kernel_flushes_hdp_before_ib;
bool htile_cmask_support_1d_tiling;
bool si_TA_CS_BC_BASE_ADDR_allowed;
bool has_bo_metadata;
bool has_gpu_reset_status_query;
bool has_eqaa_surface_allocator;
bool has_format_bc1_through_bc7;
bool kernel_flushes_tc_l2_after_ib;
bool has_indirect_compute_dispatch;
bool has_unaligned_shader_loads;
bool has_sparse_vm_mappings;
bool has_2d_tiling;
bool has_read_registers_query;
bool has_gds_ordered_append;
bool has_scheduled_fence_dependency;
/* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */
bool mid_command_buffer_preemption_enabled;
/* Kernel & winsys capabilities. */
uint32_t drm_major; /* version */
uint32_t drm_minor;
uint32_t drm_patchlevel;
bool is_amdgpu;
bool has_userptr;
bool has_syncobj;
bool has_syncobj_wait_for_submit;
bool has_timeline_syncobj;
bool has_fence_to_handle;
bool has_ctx_priority;
bool has_local_buffers;
bool kernel_flushes_hdp_before_ib;
bool htile_cmask_support_1d_tiling;
bool si_TA_CS_BC_BASE_ADDR_allowed;
bool has_bo_metadata;
bool has_gpu_reset_status_query;
bool has_eqaa_surface_allocator;
bool has_format_bc1_through_bc7;
bool kernel_flushes_tc_l2_after_ib;
bool has_indirect_compute_dispatch;
bool has_unaligned_shader_loads;
bool has_sparse_vm_mappings;
bool has_2d_tiling;
bool has_read_registers_query;
bool has_gds_ordered_append;
bool has_scheduled_fence_dependency;
/* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */
bool mid_command_buffer_preemption_enabled;
/* Shader cores. */
uint32_t cu_mask[4][2];
uint32_t r600_max_quad_pipes; /* wave size / 16 */
uint32_t max_shader_clock;
uint32_t num_good_compute_units;
uint32_t max_good_cu_per_sa;
uint32_t min_good_cu_per_sa; /* min != max if SAs have different # of CUs */
uint32_t max_se; /* shader engines */
uint32_t max_sh_per_se; /* shader arrays per shader engine */
uint32_t max_wave64_per_simd;
uint32_t num_physical_sgprs_per_simd;
uint32_t num_physical_wave64_vgprs_per_simd;
uint32_t num_simd_per_compute_unit;
uint32_t min_sgpr_alloc;
uint32_t max_sgpr_alloc;
uint32_t sgpr_alloc_granularity;
uint32_t min_wave64_vgpr_alloc;
uint32_t max_vgpr_alloc;
uint32_t wave64_vgpr_alloc_granularity;
bool use_late_alloc; /* VS and GS: late pos/param allocation */
/* Shader cores. */
uint32_t cu_mask[4][2];
uint32_t r600_max_quad_pipes; /* wave size / 16 */
uint32_t max_shader_clock;
uint32_t num_good_compute_units;
uint32_t max_good_cu_per_sa;
uint32_t min_good_cu_per_sa; /* min != max if SAs have different # of CUs */
uint32_t max_se; /* shader engines */
uint32_t max_sh_per_se; /* shader arrays per shader engine */
uint32_t max_wave64_per_simd;
uint32_t num_physical_sgprs_per_simd;
uint32_t num_physical_wave64_vgprs_per_simd;
uint32_t num_simd_per_compute_unit;
uint32_t min_sgpr_alloc;
uint32_t max_sgpr_alloc;
uint32_t sgpr_alloc_granularity;
uint32_t min_wave64_vgpr_alloc;
uint32_t max_vgpr_alloc;
uint32_t wave64_vgpr_alloc_granularity;
bool use_late_alloc; /* VS and GS: late pos/param allocation */
/* Render backends (color + depth blocks). */
uint32_t r300_num_gb_pipes;
uint32_t r300_num_z_pipes;
uint32_t r600_gb_backend_map; /* R600 harvest config */
bool r600_gb_backend_map_valid;
uint32_t r600_num_banks;
uint32_t gb_addr_config;
uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */
uint32_t num_render_backends;
uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
uint32_t pipe_interleave_bytes;
uint32_t enabled_rb_mask; /* GCN harvest config */
uint64_t max_alignment; /* from addrlib */
uint32_t pbb_max_alloc_count;
/* Render backends (color + depth blocks). */
uint32_t r300_num_gb_pipes;
uint32_t r300_num_z_pipes;
uint32_t r600_gb_backend_map; /* R600 harvest config */
bool r600_gb_backend_map_valid;
uint32_t r600_num_banks;
uint32_t gb_addr_config;
uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */
uint32_t num_render_backends;
uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
uint32_t pipe_interleave_bytes;
uint32_t enabled_rb_mask; /* GCN harvest config */
uint64_t max_alignment; /* from addrlib */
uint32_t pbb_max_alloc_count;
/* Tile modes. */
uint32_t si_tile_mode_array[32];
uint32_t cik_macrotile_mode_array[16];
/* Tile modes. */
uint32_t si_tile_mode_array[32];
uint32_t cik_macrotile_mode_array[16];
};
bool ac_query_gpu_info(int fd, void *dev_p,
struct radeon_info *info,
struct amdgpu_gpu_info *amdinfo);
bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
struct amdgpu_gpu_info *amdinfo);
void ac_compute_driver_uuid(char *uuid, size_t size);
void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size);
void ac_print_gpu_info(struct radeon_info *info);
int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family);
void ac_get_raster_config(struct radeon_info *info,
uint32_t *raster_config_p,
uint32_t *raster_config_1_p,
uint32_t *se_tile_repeat_p);
void ac_get_harvested_configs(struct radeon_info *info,
unsigned raster_config,
unsigned *cik_raster_config_1_p,
unsigned *raster_config_se);
unsigned ac_get_compute_resource_limits(struct radeon_info *info,
unsigned waves_per_threadgroup,
unsigned max_waves_per_sh,
unsigned threadgroups_per_cu);
void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p,
uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p);
void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config,
unsigned *cik_raster_config_1_p, unsigned *raster_config_se);
unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup,
unsigned max_waves_per_sh, unsigned threadgroups_per_cu);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load diff

View file

@ -24,12 +24,12 @@
#ifndef AC_RTLD_H
#define AC_RTLD_H
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include "util/u_dynarray.h"
#include "compiler/shader_enums.h"
#include "util/u_dynarray.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
@ -40,37 +40,37 @@ struct ac_shader_config;
struct radeon_info;
/* Symbol descriptor: name, size and alignment, plus the offset and the index
 * of the shader part it belongs to. ac_rtld_open_info::shared_lds_symbols
 * points to an array of these.
 */
struct ac_rtld_symbol {
const char *name;
uint32_t size;
uint32_t align;
uint64_t offset; /* filled in by ac_rtld_open */
unsigned part_idx; /* shader part in which this symbol appears */
const char *name;
uint32_t size;
uint32_t align;
uint64_t offset; /* filled in by ac_rtld_open */
unsigned part_idx; /* shader part in which this symbol appears */
};
/* Loader options, stored as single-bit bitfields. Embedded by value in both
 * ac_rtld_binary and ac_rtld_open_info.
 */
struct ac_rtld_options {
/* Loader will insert an s_sethalt 1 instruction as the
* first instruction. */
bool halt_at_entry:1;
/* Loader will insert an s_sethalt 1 instruction as the
* first instruction. */
bool halt_at_entry : 1;
};
/* Lightweight wrapper around underlying ELF objects.
 *
 * This is the handle passed to ac_rtld_open(), ac_rtld_close(),
 * ac_rtld_get_section_by_name() and ac_rtld_read_config(), all declared
 * later in this header.
 */
struct ac_rtld_binary {
struct ac_rtld_options options;
unsigned wave_size;
struct ac_rtld_options options;
unsigned wave_size;
/* Required buffer sizes, currently read/executable only. */
uint64_t rx_size;
/* Required buffer sizes, currently read/executable only. */
uint64_t rx_size;
/* Size of executable code, for reporting purposes. */
uint64_t exec_size;
/* Size of executable code, for reporting purposes. */
uint64_t exec_size;
uint64_t rx_end_markers;
uint64_t rx_end_markers;
unsigned num_parts;
struct ac_rtld_part *parts;
unsigned num_parts;
struct ac_rtld_part *parts;
struct util_dynarray lds_symbols;
uint32_t lds_size;
struct util_dynarray lds_symbols;
uint32_t lds_size;
};
/**
@ -82,8 +82,7 @@ struct ac_rtld_binary {
* \param value to be filled in by the callback
* \return whether the symbol was found successfully
*/
typedef bool (*ac_rtld_get_external_symbol_cb)(
void *cb_data, const char *symbol, uint64_t *value);
typedef bool (*ac_rtld_get_external_symbol_cb)(void *cb_data, const char *symbol, uint64_t *value);
/**
* Lifetimes of \ref info, in-memory ELF objects, and the names of
@ -91,50 +90,48 @@ typedef bool (*ac_rtld_get_external_symbol_cb)(
* the opened binary.
*/
/* Parameters for ac_rtld_open(), which takes this struct by value. */
struct ac_rtld_open_info {
const struct radeon_info *info;
struct ac_rtld_options options;
gl_shader_stage shader_type;
unsigned wave_size;
const struct radeon_info *info;
struct ac_rtld_options options;
gl_shader_stage shader_type;
unsigned wave_size;
unsigned num_parts;
const char * const *elf_ptrs; /* in-memory ELF objects of each part */
const size_t *elf_sizes; /* sizes of corresponding in-memory ELF objects in bytes */
unsigned num_parts;
const char *const *elf_ptrs; /* in-memory ELF objects of each part */
const size_t *elf_sizes; /* sizes of corresponding in-memory ELF objects in bytes */
/* Shared LDS symbols are laid out such that they are accessible from
* all shader parts. Non-shared (private) LDS symbols of one part may
* overlap private LDS symbols of another shader part.
*/
unsigned num_shared_lds_symbols;
const struct ac_rtld_symbol *shared_lds_symbols;
/* Shared LDS symbols are laid out such that they are accessible from
* all shader parts. Non-shared (private) LDS symbols of one part may
* overlap private LDS symbols of another shader part.
*/
unsigned num_shared_lds_symbols;
const struct ac_rtld_symbol *shared_lds_symbols;
};
bool ac_rtld_open(struct ac_rtld_binary *binary,
struct ac_rtld_open_info i);
bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i);
void ac_rtld_close(struct ac_rtld_binary *binary);
bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name,
const char **data, size_t *nbytes);
bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, const char **data,
size_t *nbytes);
bool ac_rtld_read_config(const struct radeon_info *info,
struct ac_rtld_binary *binary,
struct ac_shader_config *config);
bool ac_rtld_read_config(const struct radeon_info *info, struct ac_rtld_binary *binary,
struct ac_shader_config *config);
/* Parameters for ac_rtld_upload(): the opened binary, the GPU and CPU
 * mappings of the destination buffer, and an optional symbol-lookup callback
 * with its user data.
 */
struct ac_rtld_upload_info {
struct ac_rtld_binary *binary;
struct ac_rtld_binary *binary;
/** GPU mapping of the read/executable buffer. */
uint64_t rx_va;
/** GPU mapping of the read/executable buffer. */
uint64_t rx_va;
/** CPU mapping of the read/executable buffer */
char *rx_ptr;
/** CPU mapping of the read/executable buffer */
char *rx_ptr;
/** Optional callback function that will be queried for symbols not
* defined in any of the binary's parts. */
ac_rtld_get_external_symbol_cb get_external_symbol;
/** Optional callback function that will be queried for symbols not
* defined in any of the binary's parts. */
ac_rtld_get_external_symbol_cb get_external_symbol;
/** Caller-defined data that will be passed to callback functions. */
void *cb_data;
/** Caller-defined data that will be passed to callback functions. */
void *cb_data;
};
bool ac_rtld_upload(struct ac_rtld_upload_info *u);

View file

@ -22,34 +22,33 @@
*/
#include "ac_shader_args.h"
#include "nir/nir_builder.h"
/* Append one argument descriptor to info->args.
 *
 * The argument's offset is the running count of SGPRs or VGPRs used so far
 * (selected by regfile); that counter is then advanced by size. The new
 * entry records regfile, offset, size and type. When arg is non-NULL it
 * receives the index of the new entry and is marked as used. Finally
 * info->arg_count is incremented.
 *
 * Asserts that fewer than AC_MAX_ARGS arguments exist already and that
 * regfile is AC_ARG_SGPR or AC_ARG_VGPR.
 *
 * NOTE(review): the counters and the stored offset/size are uint8_t in
 * struct ac_shader_args while size is unsigned here; presumably totals stay
 * below 256 - confirm.
 *
 * NOTE(review): this listing interleaves the pre- and post-reformat lines of
 * the commit, which is why the signature and the statements appear twice.
 */
void
ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile,
unsigned size, enum ac_arg_type type, struct ac_arg *arg)
void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned size,
enum ac_arg_type type, struct ac_arg *arg)
{
assert(info->arg_count < AC_MAX_ARGS);
assert(info->arg_count < AC_MAX_ARGS);
unsigned offset;
if (regfile == AC_ARG_SGPR) {
offset = info->num_sgprs_used;
info->num_sgprs_used += size;
} else {
assert(regfile == AC_ARG_VGPR);
offset = info->num_vgprs_used;
info->num_vgprs_used += size;
}
unsigned offset;
if (regfile == AC_ARG_SGPR) {
offset = info->num_sgprs_used;
info->num_sgprs_used += size;
} else {
assert(regfile == AC_ARG_VGPR);
offset = info->num_vgprs_used;
info->num_vgprs_used += size;
}
info->args[info->arg_count].file = regfile;
info->args[info->arg_count].offset = offset;
info->args[info->arg_count].size = size;
info->args[info->arg_count].type = type;
info->args[info->arg_count].file = regfile;
info->args[info->arg_count].offset = offset;
info->args[info->arg_count].size = size;
info->args[info->arg_count].type = type;
if (arg) {
arg->arg_index = info->arg_count;
arg->used = true;
}
if (arg) {
arg->arg_index = info->arg_count;
arg->used = true;
}
info->arg_count++;
info->arg_count++;
}

View file

@ -24,91 +24,90 @@
#ifndef AC_SHADER_ARGS_H
#define AC_SHADER_ARGS_H
#include <stdint.h>
#include <stdbool.h>
#include <stdint.h>
#define AC_MAX_INLINE_PUSH_CONSTS 8
/* Register file of an argument. ac_add_arg() uses it to pick which usage
 * counter (SGPR or VGPR) supplies the argument's offset.
 */
enum ac_arg_regfile {
AC_ARG_SGPR,
AC_ARG_VGPR,
enum ac_arg_regfile
{
AC_ARG_SGPR,
AC_ARG_VGPR,
};
/* Type tag recorded for each argument in ac_shader_args::args[].type. */
enum ac_arg_type {
AC_ARG_FLOAT,
AC_ARG_INT,
AC_ARG_CONST_PTR, /* Pointer to i8 array */
AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */
AC_ARG_CONST_PTR_PTR, /* Pointer to pointer to i8 array */
AC_ARG_CONST_DESC_PTR, /* Pointer to v4i32 array */
AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */
enum ac_arg_type
{
AC_ARG_FLOAT,
AC_ARG_INT,
AC_ARG_CONST_PTR, /* Pointer to i8 array */
AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */
AC_ARG_CONST_PTR_PTR, /* Pointer to pointer to i8 array */
AC_ARG_CONST_DESC_PTR, /* Pointer to v4i32 array */
AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */
};
/* Handle to a declared argument: arg_index is the argument's position in
 * ac_shader_args::args and used is set once ac_add_arg() has filled it in.
 */
struct ac_arg {
uint8_t arg_index;
bool used;
uint8_t arg_index;
bool used;
};
#define AC_MAX_ARGS 128
/* Argument table for a shader: the args array holds one descriptor per
 * declared argument (at most AC_MAX_ARGS), followed by running counters and
 * a set of ac_arg handles naming well-known arguments.
 *
 * ac_add_arg() (declared below) appends to args and advances arg_count,
 * num_sgprs_used and num_vgprs_used.
 *
 * The members appear twice because this listing interleaves the old and the
 * reformatted lines of the commit.
 */
struct ac_shader_args {
/* Info on how to declare arguments */
struct {
enum ac_arg_type type;
enum ac_arg_regfile file;
uint8_t offset;
uint8_t size;
bool skip;
} args[AC_MAX_ARGS];
/* Info on how to declare arguments */
struct {
enum ac_arg_type type;
enum ac_arg_regfile file;
uint8_t offset;
uint8_t size;
bool skip;
} args[AC_MAX_ARGS];
uint8_t arg_count;
uint8_t sgpr_count;
uint8_t num_sgprs_used;
uint8_t num_vgprs_used;
uint8_t arg_count;
uint8_t sgpr_count;
uint8_t num_sgprs_used;
uint8_t num_vgprs_used;
struct ac_arg base_vertex;
struct ac_arg start_instance;
struct ac_arg draw_id;
struct ac_arg vertex_id;
struct ac_arg instance_id;
struct ac_arg tcs_patch_id;
struct ac_arg tcs_rel_ids;
struct ac_arg tes_patch_id;
struct ac_arg gs_prim_id;
struct ac_arg gs_invocation_id;
struct ac_arg base_vertex;
struct ac_arg start_instance;
struct ac_arg draw_id;
struct ac_arg vertex_id;
struct ac_arg instance_id;
struct ac_arg tcs_patch_id;
struct ac_arg tcs_rel_ids;
struct ac_arg tes_patch_id;
struct ac_arg gs_prim_id;
struct ac_arg gs_invocation_id;
/* PS */
struct ac_arg frag_pos[4];
struct ac_arg front_face;
struct ac_arg ancillary;
struct ac_arg sample_coverage;
struct ac_arg prim_mask;
struct ac_arg persp_sample;
struct ac_arg persp_center;
struct ac_arg persp_centroid;
struct ac_arg pull_model;
struct ac_arg linear_sample;
struct ac_arg linear_center;
struct ac_arg linear_centroid;
/* PS */
struct ac_arg frag_pos[4];
struct ac_arg front_face;
struct ac_arg ancillary;
struct ac_arg sample_coverage;
struct ac_arg prim_mask;
struct ac_arg persp_sample;
struct ac_arg persp_center;
struct ac_arg persp_centroid;
struct ac_arg pull_model;
struct ac_arg linear_sample;
struct ac_arg linear_center;
struct ac_arg linear_centroid;
/* CS */
struct ac_arg local_invocation_ids;
struct ac_arg num_work_groups;
struct ac_arg workgroup_ids[3];
struct ac_arg tg_size;
/* CS */
struct ac_arg local_invocation_ids;
struct ac_arg num_work_groups;
struct ac_arg workgroup_ids[3];
struct ac_arg tg_size;
/* Vulkan only */
struct ac_arg push_constants;
struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
unsigned num_inline_push_consts;
unsigned base_inline_push_consts;
struct ac_arg view_index;
/* Vulkan only */
struct ac_arg push_constants;
struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
unsigned num_inline_push_consts;
unsigned base_inline_push_consts;
struct ac_arg view_index;
};
/* Add one argument of the given register file, register count and type to
 * info. `arg` is an output handle.
 * NOTE(review): the implementation is not visible in this excerpt; confirm
 * whether `arg` may be NULL and how the counters in info are updated.
 */
void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned registers,
                enum ac_arg_type type, struct ac_arg *arg);
#endif

View file

@ -21,277 +21,303 @@
* IN THE SOFTWARE.
*/
#include "ac_shader_util.h"
#include "sid.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "ac_shader_util.h"
#include "sid.h"
unsigned
ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
bool writes_samplemask)
unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask)
{
if (writes_z) {
/* Z needs 32 bits. */
if (writes_samplemask)
return V_028710_SPI_SHADER_32_ABGR;
else if (writes_stencil)
return V_028710_SPI_SHADER_32_GR;
else
return V_028710_SPI_SHADER_32_R;
} else if (writes_stencil || writes_samplemask) {
/* Both stencil and sample mask need only 16 bits. */
return V_028710_SPI_SHADER_UINT16_ABGR;
} else {
return V_028710_SPI_SHADER_ZERO;
}
if (writes_z) {
/* Z needs 32 bits. */
if (writes_samplemask)
return V_028710_SPI_SHADER_32_ABGR;
else if (writes_stencil)
return V_028710_SPI_SHADER_32_GR;
else
return V_028710_SPI_SHADER_32_R;
} else if (writes_stencil || writes_samplemask) {
/* Both stencil and sample mask need only 16 bits. */
return V_028710_SPI_SHADER_UINT16_ABGR;
} else {
return V_028710_SPI_SHADER_ZERO;
}
}
unsigned
ac_get_cb_shader_mask(unsigned spi_shader_col_format)
unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
{
unsigned i, cb_shader_mask = 0;
unsigned i, cb_shader_mask = 0;
for (i = 0; i < 8; i++) {
switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
case V_028714_SPI_SHADER_ZERO:
break;
case V_028714_SPI_SHADER_32_R:
cb_shader_mask |= 0x1 << (i * 4);
break;
case V_028714_SPI_SHADER_32_GR:
cb_shader_mask |= 0x3 << (i * 4);
break;
case V_028714_SPI_SHADER_32_AR:
cb_shader_mask |= 0x9u << (i * 4);
break;
case V_028714_SPI_SHADER_FP16_ABGR:
case V_028714_SPI_SHADER_UNORM16_ABGR:
case V_028714_SPI_SHADER_SNORM16_ABGR:
case V_028714_SPI_SHADER_UINT16_ABGR:
case V_028714_SPI_SHADER_SINT16_ABGR:
case V_028714_SPI_SHADER_32_ABGR:
cb_shader_mask |= 0xfu << (i * 4);
break;
default:
assert(0);
}
}
return cb_shader_mask;
for (i = 0; i < 8; i++) {
switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
case V_028714_SPI_SHADER_ZERO:
break;
case V_028714_SPI_SHADER_32_R:
cb_shader_mask |= 0x1 << (i * 4);
break;
case V_028714_SPI_SHADER_32_GR:
cb_shader_mask |= 0x3 << (i * 4);
break;
case V_028714_SPI_SHADER_32_AR:
cb_shader_mask |= 0x9u << (i * 4);
break;
case V_028714_SPI_SHADER_FP16_ABGR:
case V_028714_SPI_SHADER_UNORM16_ABGR:
case V_028714_SPI_SHADER_SNORM16_ABGR:
case V_028714_SPI_SHADER_UINT16_ABGR:
case V_028714_SPI_SHADER_SINT16_ABGR:
case V_028714_SPI_SHADER_32_ABGR:
cb_shader_mask |= 0xfu << (i * 4);
break;
default:
assert(0);
}
}
return cb_shader_mask;
}
/**
* Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
* geometry shader.
*/
uint32_t
ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
{
unsigned cut_mode;
unsigned cut_mode;
if (gs_max_vert_out <= 128) {
cut_mode = V_028A40_GS_CUT_128;
} else if (gs_max_vert_out <= 256) {
cut_mode = V_028A40_GS_CUT_256;
} else if (gs_max_vert_out <= 512) {
cut_mode = V_028A40_GS_CUT_512;
} else {
assert(gs_max_vert_out <= 1024);
cut_mode = V_028A40_GS_CUT_1024;
}
if (gs_max_vert_out <= 128) {
cut_mode = V_028A40_GS_CUT_128;
} else if (gs_max_vert_out <= 256) {
cut_mode = V_028A40_GS_CUT_256;
} else if (gs_max_vert_out <= 512) {
cut_mode = V_028A40_GS_CUT_512;
} else {
assert(gs_max_vert_out <= 1024);
cut_mode = V_028A40_GS_CUT_1024;
}
return S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
S_028A40_CUT_MODE(cut_mode)|
S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) |
S_028A40_GS_WRITE_OPTIMIZE(1) |
S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
}
/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
/// value for LLVM8+ tbuffer intrinsics.
unsigned
ac_get_tbuffer_format(enum chip_class chip_class,
unsigned dfmt, unsigned nfmt)
unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt)
{
// Some games try to access vertex buffers without a valid format.
// This is a game bug, but we should still handle it gracefully.
if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
return V_008F0C_IMG_FORMAT_INVALID;
// Some games try to access vertex buffers without a valid format.
// This is a game bug, but we should still handle it gracefully.
if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
return V_008F0C_IMG_FORMAT_INVALID;
if (chip_class >= GFX10) {
unsigned format;
switch (dfmt) {
default: unreachable("bad dfmt");
case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break;
case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break;
}
if (chip_class >= GFX10) {
unsigned format;
switch (dfmt) {
default:
unreachable("bad dfmt");
case V_008F0C_BUF_DATA_FORMAT_INVALID:
format = V_008F0C_IMG_FORMAT_INVALID;
break;
case V_008F0C_BUF_DATA_FORMAT_8:
format = V_008F0C_IMG_FORMAT_8_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_8_8:
format = V_008F0C_IMG_FORMAT_8_8_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_16:
format = V_008F0C_IMG_FORMAT_16_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_16_16:
format = V_008F0C_IMG_FORMAT_16_16_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_32:
format = V_008F0C_IMG_FORMAT_32_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_32_32:
format = V_008F0C_IMG_FORMAT_32_32_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_32_32_32:
format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT;
break;
case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT;
break;
}
// Use the regularity properties of the combined format enum.
//
// Note: float is incompatible with 8-bit data formats,
// [us]{norm,scaled} are incomparible with 32-bit data formats.
// [us]scaled are not writable.
switch (nfmt) {
case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
default: unreachable("bad nfmt");
case V_008F0C_BUF_NUM_FORMAT_UINT: break;
case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
}
// Use the regularity properties of the combined format enum.
//
// Note: float is incompatible with 8-bit data formats,
// [us]{norm,scaled} are incomparible with 32-bit data formats.
// [us]scaled are not writable.
switch (nfmt) {
case V_008F0C_BUF_NUM_FORMAT_UNORM:
format -= 4;
break;
case V_008F0C_BUF_NUM_FORMAT_SNORM:
format -= 3;
break;
case V_008F0C_BUF_NUM_FORMAT_USCALED:
format -= 2;
break;
case V_008F0C_BUF_NUM_FORMAT_SSCALED:
format -= 1;
break;
default:
unreachable("bad nfmt");
case V_008F0C_BUF_NUM_FORMAT_UINT:
break;
case V_008F0C_BUF_NUM_FORMAT_SINT:
format += 1;
break;
case V_008F0C_BUF_NUM_FORMAT_FLOAT:
format += 2;
break;
}
return format;
} else {
return dfmt | (nfmt << 4);
}
return format;
} else {
return dfmt | (nfmt << 4);
}
}
/* Per-data-format properties, indexed by V_008F0C_BUF_DATA_FORMAT_*.
 * Columns follow struct ac_data_format_info:
 *    { element_size, num_channels, chan_byte_size, chan_format }
 * Entries with chan_byte_size 0 map chan_format to the format itself.
 */
static const struct ac_data_format_info data_format_table[] = {
   [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
   [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
};
const struct ac_data_format_info *
ac_get_data_format_info(unsigned dfmt)
const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
{
assert(dfmt < ARRAY_SIZE(data_format_table));
return &data_format_table[dfmt];
assert(dfmt < ARRAY_SIZE(data_format_table));
return &data_format_table[dfmt];
}
enum ac_image_dim
ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
bool is_array)
enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
bool is_array)
{
switch (dim) {
case GLSL_SAMPLER_DIM_1D:
if (chip_class == GFX9)
return is_array ? ac_image_2darray : ac_image_2d;
return is_array ? ac_image_1darray : ac_image_1d;
case GLSL_SAMPLER_DIM_2D:
case GLSL_SAMPLER_DIM_RECT:
case GLSL_SAMPLER_DIM_EXTERNAL:
return is_array ? ac_image_2darray : ac_image_2d;
case GLSL_SAMPLER_DIM_3D:
return ac_image_3d;
case GLSL_SAMPLER_DIM_CUBE:
return ac_image_cube;
case GLSL_SAMPLER_DIM_MS:
return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
case GLSL_SAMPLER_DIM_SUBPASS:
return ac_image_2darray;
case GLSL_SAMPLER_DIM_SUBPASS_MS:
return ac_image_2darraymsaa;
default:
unreachable("bad sampler dim");
}
switch (dim) {
case GLSL_SAMPLER_DIM_1D:
if (chip_class == GFX9)
return is_array ? ac_image_2darray : ac_image_2d;
return is_array ? ac_image_1darray : ac_image_1d;
case GLSL_SAMPLER_DIM_2D:
case GLSL_SAMPLER_DIM_RECT:
case GLSL_SAMPLER_DIM_EXTERNAL:
return is_array ? ac_image_2darray : ac_image_2d;
case GLSL_SAMPLER_DIM_3D:
return ac_image_3d;
case GLSL_SAMPLER_DIM_CUBE:
return ac_image_cube;
case GLSL_SAMPLER_DIM_MS:
return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
case GLSL_SAMPLER_DIM_SUBPASS:
return ac_image_2darray;
case GLSL_SAMPLER_DIM_SUBPASS_MS:
return ac_image_2darraymsaa;
default:
unreachable("bad sampler dim");
}
}
enum ac_image_dim
ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
bool is_array)
enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
bool is_array)
{
enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);
enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);
/* Match the resource type set in the descriptor. */
if (dim == ac_image_cube ||
(chip_class <= GFX8 && dim == ac_image_3d))
dim = ac_image_2darray;
else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
/* When a single layer of a 3D texture is bound, the shader
* will refer to a 2D target, but the descriptor has a 3D type.
* Since the HW ignores BASE_ARRAY in this case, we need to
* send 3 coordinates. This doesn't hurt when the underlying
* texture is non-3D.
*/
dim = ac_image_3d;
}
/* Match the resource type set in the descriptor. */
if (dim == ac_image_cube || (chip_class <= GFX8 && dim == ac_image_3d))
dim = ac_image_2darray;
else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
/* When a single layer of a 3D texture is bound, the shader
* will refer to a 2D target, but the descriptor has a 3D type.
* Since the HW ignores BASE_ARRAY in this case, we need to
* send 3 coordinates. This doesn't hurt when the underlying
* texture is non-3D.
*/
dim = ac_image_3d;
}
return dim;
return dim;
}
unsigned
ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
signed char *face_vgpr_index_ptr,
signed char *ancillary_vgpr_index_ptr)
unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
signed char *face_vgpr_index_ptr,
signed char *ancillary_vgpr_index_ptr)
{
unsigned num_input_vgprs = 0;
signed char face_vgpr_index = -1;
signed char ancillary_vgpr_index = -1;
unsigned num_input_vgprs = 0;
signed char face_vgpr_index = -1;
signed char ancillary_vgpr_index = -1;
if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
num_input_vgprs += 3;
if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
face_vgpr_index = num_input_vgprs;
num_input_vgprs += 1;
}
if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
ancillary_vgpr_index = num_input_vgprs;
num_input_vgprs += 1;
}
if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
num_input_vgprs += 3;
if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
num_input_vgprs += 2;
if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
face_vgpr_index = num_input_vgprs;
num_input_vgprs += 1;
}
if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
ancillary_vgpr_index = num_input_vgprs;
num_input_vgprs += 1;
}
if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
num_input_vgprs += 1;
if (face_vgpr_index_ptr)
*face_vgpr_index_ptr = face_vgpr_index;
if (ancillary_vgpr_index_ptr)
*ancillary_vgpr_index_ptr = ancillary_vgpr_index;
if (face_vgpr_index_ptr)
*face_vgpr_index_ptr = face_vgpr_index;
if (ancillary_vgpr_index_ptr)
*ancillary_vgpr_index_ptr = ancillary_vgpr_index;
return num_input_vgprs;
return num_input_vgprs;
}
void ac_choose_spi_color_formats(unsigned format, unsigned swap,
unsigned ntype, bool is_depth,
struct ac_spi_color_formats *formats)
void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth,
struct ac_spi_color_formats *formats)
{
/* Alpha is needed for alpha-to-coverage.
* Blending may be with or without alpha.

View file

@ -24,75 +24,64 @@
#ifndef AC_SHADER_UTIL_H
#define AC_SHADER_UTIL_H
#include "ac_binary.h"
#include "amd_family.h"
#include "compiler/nir/nir.h"
#include <stdbool.h>
#include <stdint.h>
#include "amd_family.h"
#include "ac_binary.h"
#include "compiler/nir/nir.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Image dimensionality as returned by ac_get_sampler_dim() and
 * ac_get_image_dim().
 */
enum ac_image_dim
{
   ac_image_1d,
   ac_image_2d,
   ac_image_3d,
   ac_image_cube, // includes cube arrays
   ac_image_1darray,
   ac_image_2darray,
   ac_image_2dmsaa,
   ac_image_2darraymsaa,
};
/* Properties of one buffer data format, as returned by
 * ac_get_data_format_info().
 * NOTE(review): sizes look like bytes (chan_byte_size is named so
 * explicitly) - confirm the unit of element_size.
 */
struct ac_data_format_info {
   uint8_t element_size;
   uint8_t num_channels;
   uint8_t chan_byte_size;
   uint8_t chan_format;
};
/* Four 8-bit SPI color export format choices, filled in by
 * ac_choose_spi_color_formats(): one for each combination of "alpha
 * needed" and "blending enabled".
 */
struct ac_spi_color_formats {
   unsigned normal : 8;
   unsigned alpha : 8;
   unsigned blend : 8;
   unsigned blend_alpha : 8;
};
/* SPI shader Z export format for the given set of written outputs. */
unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask);

/* CB shader mask matching a packed SPI_SHADER_COL_FORMAT value. */
unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format);

/* VGT_GS_MODE register value for a geometry shader. */
uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class);

/* Combined tbuffer format for a (dfmt, nfmt) pair on the given chip. */
unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt);

/* Static properties of a buffer data format. */
const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt);

/* Image dimension used when sampling from the given GLSL sampler dim. */
enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
                                     bool is_array);

/* Image dimension used for image access; matches the descriptor type. */
enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
                                   bool is_array);

/* Number of PS input VGPRs; the two out pointers are optional. */
unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
                                  signed char *face_vgpr_index, signed char *ancillary_vgpr_index);

void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth,
                                 struct ac_spi_color_formats *formats);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load diff

View file

@ -35,7 +35,8 @@ struct ac_reg_range {
unsigned size;
};
enum ac_reg_range_type {
enum ac_reg_range_type
{
SI_REG_RANGE_UCONFIG,
SI_REG_RANGE_CONTEXT,
SI_REG_RANGE_SH,
@ -46,14 +47,13 @@ enum ac_reg_range_type {
SI_NUM_ALL_REG_RANGES,
};
/* Callback used by ac_emulate_clear_state().
 * NOTE(review): presumably writes `num` consecutive register values starting
 * at `reg` into `cs` - the implementation is not visible here, confirm.
 */
typedef void (*set_context_reg_seq_array_fn)(struct radeon_cmdbuf *cs, unsigned reg, unsigned num,
                                             const uint32_t *values);

void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
                       enum ac_reg_range_type type, unsigned *num_ranges,
                       const struct ac_reg_range **ranges);
void ac_emulate_clear_state(const struct radeon_info *info, struct radeon_cmdbuf *cs,
                            set_context_reg_seq_array_fn set_context_reg_seq_array);
void ac_check_shadowed_regs(enum chip_class chip_class, enum radeon_family family,
                            unsigned reg_offset, unsigned count);

File diff suppressed because it is too large Load diff

View file

@ -26,11 +26,11 @@
#ifndef AC_SURFACE_H
#define AC_SURFACE_H
#include <stdint.h>
#include <stdbool.h>
#include "amd_family.h"
#include <stdbool.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
@ -41,280 +41,274 @@ struct ac_addrlib;
struct amdgpu_gpu_info;
struct radeon_info;
/* Size of the per-mip-level arrays in the surface layout structs. */
#define RADEON_SURF_MAX_LEVELS 15
/* Surface tiling mode. Values start at 1 and fit the 2-bit `mode` bitfield
 * of struct legacy_surf_level.
 */
enum radeon_surf_mode
{
   RADEON_SURF_MODE_LINEAR_ALIGNED = 1,
   RADEON_SURF_MODE_1D = 2,
   RADEON_SURF_MODE_2D = 3,
};
/* This describes D/S/Z/R swizzle modes.
 * Defined in the GB_TILE_MODEn.MICRO_TILE_MODE_NEW order.
 */
enum radeon_micro_mode
{
   RADEON_MICRO_MODE_DISPLAY = 0,
   RADEON_MICRO_MODE_STANDARD = 1,
   RADEON_MICRO_MODE_DEPTH = 2,
   RADEON_MICRO_MODE_RENDER = 3, /* gfx9 and older: rotated */
};
/* the first 16 bits are reserved for libdrm_radeon, don't use them */
#define RADEON_SURF_SCANOUT      (1 << 16)
#define RADEON_SURF_ZBUFFER      (1 << 17)
#define RADEON_SURF_SBUFFER      (1 << 18)
#define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
/* bits 19 and 20 are reserved for libdrm_radeon, don't use them */
#define RADEON_SURF_FMASK                 (1 << 21)
#define RADEON_SURF_DISABLE_DCC           (1 << 22)
#define RADEON_SURF_TC_COMPATIBLE_HTILE   (1 << 23)
#define RADEON_SURF_IMPORTED              (1 << 24)
#define RADEON_SURF_CONTIGUOUS_DCC_LAYERS (1 << 25)
#define RADEON_SURF_SHAREABLE             (1 << 26)
#define RADEON_SURF_NO_RENDER_TARGET      (1 << 27)
/* Force a swizzle mode (gfx9+) or tile mode (gfx6-8).
 * If this is not set, optimize for space. */
#define RADEON_SURF_FORCE_SWIZZLE_MODE    (1 << 28)
#define RADEON_SURF_NO_FMASK              (1 << 29)
#define RADEON_SURF_NO_HTILE              (1 << 30)
/* Bit 31 needs an unsigned literal: 1 << 31 overflows a signed int. */
#define RADEON_SURF_FORCE_MICRO_TILE_MODE (1u << 31)
/* Layout of one mipmap level in the legacy (pre-gfx9) surface description. */
struct legacy_surf_level {
   uint64_t offset;
   uint32_t slice_size_dw; /* in dwords; max = 4GB / 4. */
   uint32_t dcc_offset;    /* relative offset within DCC mip tree */
   uint32_t dcc_fast_clear_size;
   uint32_t dcc_slice_fast_clear_size;
   unsigned nblk_x : 15;
   unsigned nblk_y : 15;
   enum radeon_surf_mode mode : 2;
};
/* FMASK parameters of a legacy surface (see struct legacy_surf_layout). */
struct legacy_surf_fmask {
   unsigned slice_tile_max; /* max 4M */
   uint8_t tiling_index;    /* max 31 */
   uint8_t bankh;           /* max 8 */
   uint16_t pitch_in_pixels;
   uint64_t slice_size;
};
/* Legacy surface layout: tiling parameters plus per-level data for the
 * color/depth plane, the stencil plane and FMASK.
 */
struct legacy_surf_layout {
   unsigned bankw : 4;               /* max 8 */
   unsigned bankh : 4;               /* max 8 */
   unsigned mtilea : 4;              /* max 8 */
   unsigned tile_split : 13;         /* max 4K */
   unsigned stencil_tile_split : 13; /* max 4K */
   unsigned pipe_config : 5;         /* max 17 */
   unsigned num_banks : 5;           /* max 16 */
   unsigned macro_tile_index : 4;    /* max 15 */

   /* Whether the depth miptree or stencil miptree as used by the DB are
    * adjusted from their TC compatible form to ensure depth/stencil
    * compatibility. If either is true, the corresponding plane cannot be
    * sampled from.
    */
   unsigned depth_adjusted : 1;
   unsigned stencil_adjusted : 1;

   struct legacy_surf_level level[RADEON_SURF_MAX_LEVELS];
   struct legacy_surf_level stencil_level[RADEON_SURF_MAX_LEVELS];
   uint8_t tiling_index[RADEON_SURF_MAX_LEVELS];
   uint8_t stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
   struct legacy_surf_fmask fmask;
   unsigned cmask_slice_tile_max;
};
/* Same as addrlib - AddrResourceType. */
enum gfx9_resource_type
{
   RADEON_RESOURCE_1D = 0,
   RADEON_RESOURCE_2D,
   RADEON_RESOURCE_3D,
};
/* Per-plane swizzle parameters used by struct gfx9_surf_layout. */
struct gfx9_surf_flags {
   uint16_t swizzle_mode; /* tile mode */
   uint16_t epitch;       /* (pitch - 1) or (height - 1) */
};
/* Alignment and block-size flags of a metadata surface; used for DCC in
 * struct gfx9_surf_layout.
 */
struct gfx9_surf_meta_flags {
   unsigned rb_aligned : 1;   /* optimal for RBs */
   unsigned pipe_aligned : 1; /* optimal for TC */
   unsigned independent_64B_blocks : 1;
   unsigned independent_128B_blocks : 1;
   unsigned max_compressed_block_size : 2;
};
/* GFX9+ surface layout: swizzle parameters of each plane, DCC metadata
 * flags, and the sizes/offsets computed for the surface.
 */
struct gfx9_surf_layout {
   struct gfx9_surf_flags surf;    /* color or depth surface */
   struct gfx9_surf_flags fmask;   /* not added to surf_size */
   struct gfx9_surf_flags stencil; /* added to surf_size, use stencil_offset */

   struct gfx9_surf_meta_flags dcc; /* metadata of color */

   enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */
   uint16_t surf_pitch;                   /* in blocks */
   uint16_t surf_height;

   uint64_t surf_offset; /* 0 unless imported with an offset */
   /* The size of the 2D plane containing all mipmap levels. */
   uint64_t surf_slice_size;
   /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
   uint32_t offset[RADEON_SURF_MAX_LEVELS];
   /* Mipmap level pitch in elements. Only valid for LINEAR. */
   uint16_t pitch[RADEON_SURF_MAX_LEVELS];

   uint64_t stencil_offset; /* separate stencil */

   uint8_t dcc_block_width;
   uint8_t dcc_block_height;
   uint8_t dcc_block_depth;

   /* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0.
    * The 3D engine doesn't support that layout except for chips with 1 RB.
    * All other chips must set rb_aligned=1.
    * A compute shader needs to convert from aligned DCC to unaligned.
    */
   uint32_t display_dcc_size;
   uint32_t display_dcc_alignment;
   uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */
   bool dcc_retile_use_uint16;     /* if all values fit into uint16_t */
   uint32_t dcc_retile_num_elements;
   void *dcc_retile_map;
};
/* Description of one surface: format properties and flags, followed by the
 * computed layout (tile swizzles, sizes, alignments, offsets of the combined
 * buffers) and the generation-specific layout stored in "u".
 *
 * Each member is declared exactly once; the order of members is part of the
 * structure layout and must not be changed.
 */
struct radeon_surf {
   /* Format properties. */
   unsigned blk_w : 4;
   unsigned blk_h : 4;
   unsigned bpe : 5;
   /* Number of mipmap levels where DCC is enabled starting from level 0.
    * Non-zero levels may be disabled due to alignment constraints, but not
    * the first level.
    */
   unsigned num_dcc_levels : 4;
   unsigned is_linear : 1;
   unsigned has_stencil : 1;
   /* This might be true even if micro_tile_mode isn't displayable or rotated. */
   unsigned is_displayable : 1;
   /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */
   unsigned micro_tile_mode : 3;
   uint32_t flags;

   /* These are return values. Some of them can be set by the caller, but
    * they will be treated as hints (e.g. bankw, bankh) and might be
    * changed by the calculator.
    */

   /* Tile swizzle can be OR'd with low bits of the BASE_256B address.
    * The value is the same for all mipmap levels. Supported tile modes:
    * - GFX6: Only macro tiling.
    * - GFX9: Only *_X and *_T swizzle modes. Level 0 must not be in the mip
    *   tail.
    *
    * Only these surfaces are allowed to set it:
    * - color (if it doesn't have to be displayable)
    * - DCC (same tile swizzle as color)
    * - FMASK
    * - CMASK if it's TC-compatible or if the gen is GFX9
    * - depth/stencil if HTILE is not TC-compatible and if the gen is not GFX9
    */
   uint8_t tile_swizzle;
   uint8_t fmask_tile_swizzle;

   uint64_t surf_size;
   uint64_t fmask_size;
   uint32_t surf_alignment;
   uint32_t fmask_alignment;

   /* DCC and HTILE are very small. */
   uint32_t dcc_size;
   uint32_t dcc_slice_size;
   uint32_t dcc_alignment;

   uint32_t htile_size;
   uint32_t htile_slice_size;
   uint32_t htile_alignment;

   uint32_t cmask_size;
   uint32_t cmask_slice_size;
   uint32_t cmask_alignment;

   /* All buffers combined. */
   uint64_t htile_offset;
   uint64_t fmask_offset;
   uint64_t cmask_offset;
   uint64_t dcc_offset;
   uint64_t display_dcc_offset;
   uint64_t dcc_retile_map_offset;
   uint64_t total_size;
   uint32_t alignment;

   union {
      /* Return values for GFX8 and older.
       *
       * Some of them can be set by the caller if certain parameters are
       * desirable. The allocator will try to obey them.
       */
      struct legacy_surf_layout legacy;

      /* GFX9+ return values. */
      struct gfx9_surf_layout gfx9;
   } u;
};
/* Dimensions, sample counts and mip/array counts of a surface, plus pointers
 * to the counters used for tile swizzling. Each member is declared exactly
 * once.
 */
struct ac_surf_info {
   uint32_t width;
   uint32_t height;
   uint32_t depth;
   uint8_t samples;         /* For Z/S: samples; For color: FMASK coverage samples */
   uint8_t storage_samples; /* For color: allocated samples */
   uint8_t levels;
   uint8_t num_channels; /* heuristic for displayability */
   uint16_t array_size;
   uint32_t *surf_index; /* Set a monotonic counter for tile swizzling. */
   uint32_t *fmask_surf_index;
};
/* A struct ac_surf_info together with three one-bit flags (is_1d, is_3d,
 * is_cube). Each member is declared exactly once.
 */
struct ac_surf_config {
   struct ac_surf_info info;
   unsigned is_1d : 1;
   unsigned is_3d : 1;
   unsigned is_cube : 1;
};
/* Entry points of the surface-layout code. Only the declarations are visible
 * here, so the comments below describe the signatures; see the implementation
 * for the exact contract of each call.
 */

/* Returns a struct ac_addrlib pointer; max_alignment is an output pointer. */
struct ac_addrlib *ac_addrlib_create(const struct radeon_info *info,
                                     const struct amdgpu_gpu_info *amdinfo,
                                     uint64_t *max_alignment);
void ac_addrlib_destroy(struct ac_addrlib *addrlib);

/* config is read-only input; surf is passed non-const. Returns an int status
 * (NOTE(review): presumably 0 on success - confirm against the implementation).
 */
int ac_compute_surface(struct ac_addrlib *addrlib, const struct radeon_info *info,
                       const struct ac_surf_config *config, enum radeon_surf_mode mode,
                       struct radeon_surf *surf);
void ac_surface_zero_dcc_fields(struct radeon_surf *surf);

/* BO metadata: the setter takes tiling_flags by value and has a mode output
 * pointer; the getter has a tiling_flags output pointer.
 */
void ac_surface_set_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf,
                                uint64_t tiling_flags, enum radeon_surf_mode *mode);
void ac_surface_get_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf,
                                uint64_t *tiling_flags);

/* UMD metadata: "metadata" is declared as an array of 64 dwords in both calls
 * and "desc" as an array of 8 dwords; size_metadata is passed by value to the
 * setter and by pointer to the getter.
 */
bool ac_surface_set_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf,
                                 unsigned num_storage_samples, unsigned num_mipmap_levels,
                                 unsigned size_metadata, uint32_t metadata[64]);
void ac_surface_get_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf,
                                 unsigned num_mipmap_levels, uint32_t desc[8],
                                 unsigned *size_metadata, uint32_t metadata[64]);

void ac_surface_override_offset_stride(const struct radeon_info *info, struct radeon_surf *surf,
                                       unsigned num_mipmap_levels, uint64_t offset, unsigned pitch);

#ifdef __cplusplus
}

View file

@ -24,117 +24,120 @@
#ifndef AMD_FAMILY_H
#define AMD_FAMILY_H
/* Chip identifiers. Enumerators are numbered implicitly in declaration order
 * starting from CHIP_UNKNOWN = 0, so inserting or reordering entries changes
 * the value of every later one. CHIP_LAST is the final enumerator.
 */
enum radeon_family
{
   CHIP_UNKNOWN = 0,
   CHIP_R300, /* R3xx-based cores. (GFX2) */
   CHIP_R350,
   CHIP_RV350,
   CHIP_RV370,
   CHIP_RV380,
   CHIP_RS400,
   CHIP_RC410,
   CHIP_RS480,
   CHIP_R420, /* R4xx-based cores. (GFX2) */
   CHIP_R423,
   CHIP_R430,
   CHIP_R480,
   CHIP_R481,
   CHIP_RV410,
   CHIP_RS600,
   CHIP_RS690,
   CHIP_RS740,
   CHIP_RV515, /* R5xx-based cores. (GFX2) */
   CHIP_R520,
   CHIP_RV530,
   CHIP_R580,
   CHIP_RV560,
   CHIP_RV570,
   CHIP_R600, /* GFX3 (R6xx) */
   CHIP_RV610,
   CHIP_RV630,
   CHIP_RV670,
   CHIP_RV620,
   CHIP_RV635,
   CHIP_RS780,
   CHIP_RS880,
   CHIP_RV770, /* GFX3 (R7xx) */
   CHIP_RV730,
   CHIP_RV710,
   CHIP_RV740,
   CHIP_CEDAR, /* GFX4 (Evergreen) */
   CHIP_REDWOOD,
   CHIP_JUNIPER,
   CHIP_CYPRESS,
   CHIP_HEMLOCK,
   CHIP_PALM,
   CHIP_SUMO,
   CHIP_SUMO2,
   CHIP_BARTS,
   CHIP_TURKS,
   CHIP_CAICOS,
   CHIP_CAYMAN, /* GFX5 (Northern Islands) */
   CHIP_ARUBA,
   CHIP_TAHITI, /* GFX6 (Southern Islands) */
   CHIP_PITCAIRN,
   CHIP_VERDE,
   CHIP_OLAND,
   CHIP_HAINAN,
   CHIP_BONAIRE, /* GFX7 (Sea Islands) */
   CHIP_KAVERI,
   CHIP_KABINI,
   CHIP_HAWAII,
   CHIP_TONGA, /* GFX8 (Volcanic Islands & Polaris) */
   CHIP_ICELAND,
   CHIP_CARRIZO,
   CHIP_FIJI,
   CHIP_STONEY,
   CHIP_POLARIS10,
   CHIP_POLARIS11,
   CHIP_POLARIS12,
   CHIP_VEGAM,
   CHIP_VEGA10, /* GFX9 (Vega) */
   CHIP_VEGA12,
   CHIP_VEGA20,
   CHIP_RAVEN,
   CHIP_RAVEN2,
   CHIP_RENOIR,
   CHIP_ARCTURUS,
   CHIP_NAVI10,
   CHIP_NAVI12,
   CHIP_NAVI14,
   CHIP_SIENNA_CICHLID,
   CHIP_NAVY_FLOUNDER,
   CHIP_LAST,
};
/* Chip generations, numbered implicitly in declaration order starting from
 * CLASS_UNKNOWN = 0, so later generations compare greater than earlier ones.
 */
enum chip_class
{
   CLASS_UNKNOWN = 0,
   R300,
   R400,
   R500,
   R600,
   R700,
   EVERGREEN,
   CAYMAN,
   GFX6,
   GFX7,
   GFX8,
   GFX9,
   GFX10,
   GFX10_3,
};
/* Ring identifiers, numbered implicitly from RING_GFX = 0. NUM_RING_TYPES is
 * the last enumerator, so its value equals the number of rings listed before
 * it.
 */
enum ring_type
{
   RING_GFX = 0,
   RING_COMPUTE,
   RING_DMA,
   RING_UVD,
   RING_VCE,
   RING_UVD_ENC,
   RING_VCN_DEC,
   RING_VCN_ENC,
   RING_VCN_JPEG,
   NUM_RING_TYPES,
};
#endif

View file

@ -30,13 +30,12 @@
//---------------------------------------------------------------------------//
// Sets val bits for specified mask in specified dst packed instance.
//
// "mask" must be a plain identifier for which a companion <mask>_SHIFT
// constant exists (it is token-pasted). "dst" must be a modifiable lvalue and
// is evaluated twice. The body is wrapped in do { } while (0) so that the
// clear and the set form a single statement: without it, an unbraced
// "if (c) AMD_HSA_BITS_SET(...);" would guard only the clear.
#define AMD_HSA_BITS_SET(dst, mask, val)                                                           \
   do {                                                                                            \
      (dst) &= (~(1 << mask##_SHIFT) & ~(mask));                                                   \
      (dst) |= (((val) << mask##_SHIFT) & (mask));                                                 \
   } while (0)
// Gets bits for specified mask from specified src packed instance.
//
// "mask" must be a plain identifier with a companion <mask>_SHIFT constant.
// "src" is parenthesised so that an expression argument such as "a | b" is
// masked as a whole.
#define AMD_HSA_BITS_GET(src, mask) (((src) & (mask)) >> mask##_SHIFT)
/* Every amd_*_code_t has the following properties, which are composed of
* a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
@ -47,132 +46,164 @@
* implementation defined in the C standard and so cannot be used to
* specify an ABI)
*/
/* Bit-field layout of the 32-bit code properties word. Every field FOO is
 * described by three enumerators: FOO_SHIFT (position of its lowest bit),
 * FOO_WIDTH (number of bits) and FOO itself (the in-place mask, computed as
 * ((1 << FOO_WIDTH) - 1) << FOO_SHIFT). The fields cover bits 0-31 without
 * gaps.
 */
enum amd_code_property_mask_t
{

   /* Enable the setup of the SGPR user data registers
    * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
    * for initial register state.
    *
    * The total number of SGPR user data registers requested must not
    * exceed 16. Any requests beyond 16 will be ignored.
    *
    * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
    * SGPR user data registers enabled up to 16).
    */
   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,

   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z =
      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,

   AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
   AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
   AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1)
                                 << AMD_CODE_PROPERTY_RESERVED1_SHIFT,

   /* Control wave ID base counter for GDS ordered-append. Used to set
    * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
    * ORDERED_APPEND_MODE also needs to be settable)
    */
   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS =
      ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1)
      << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,

   /* The interleave (swizzle) element size in bytes required by the
    * code for private memory. This must be 2, 4, 8 or 16. This value
    * is provided to the finalizer when it is invoked and is recorded
    * here. The hardware will interleave the memory requests of each
    * lane of a wavefront by this element size to ensure each
    * work-item gets a distinct memory location. Therefore, the
    * finalizer ensures that all load and store operations done to
    * private memory do not exceed this size. For example, if the
    * element size is 4 (32-bits or dword) and a 64-bit value must be
    * loaded, the finalizer will generate two 32-bit loads. This
    * ensures that the interleaving will get the work-item
    * specific dword for both halves of the 64-bit value. If it just
    * did a 64-bit load then it would get one dword which belonged to
    * its own work-item, but the second dword would belong to the
    * adjacent lane work-item since the interleaving is in dwords.
    *
    * The value used must match the value that the runtime configures
    * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
    * is generally DWORD.
    *
    * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
    */
   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE =
      ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1)
      << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,

   /* Are global memory addresses 64 bits. Must match
    * amd_kernel_code_t.hsail_machine_model ==
    * HSA_MACHINE_LARGE. Must also match
    * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
    * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
    */
   AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
   AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
   AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1)
                                << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,

   /* Indicate if the generated ISA is using a dynamically sized call
    * stack. This can happen if calls are implemented using a call
    * stack and recursion, alloca or calls to indirect functions are
    * present. In these cases the Finalizer cannot compute the total
    * private segment size at compile time. In this case the
    * workitem_private_segment_byte_size only specifies the statically
    * known private segment size, and additional space must be added
    * for the call stack.
    */
   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK =
      ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1)
      << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,

   /* Indicate if code generated has support for debugging. */
   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1)
                                          << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,

   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1)
                                          << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,

   /* This field occupies bits 23-31, so its mask has bit 31 set. Computing
    * 0x1ff << 23 in signed int is not representable (undefined behaviour in
    * C), therefore the shift is done on an unsigned value and the result is
    * converted back to int, which keeps the same 32-bit pattern (0xff800000).
    */
   AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
   AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
   AMD_CODE_PROPERTY_RESERVED2 = (int)(((1u << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1)
                                       << AMD_CODE_PROPERTY_RESERVED2_SHIFT)
};
/* AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
@ -381,154 +412,154 @@ enum amd_code_property_mask_t {
*/
typedef struct amd_kernel_code_s {
uint32_t amd_kernel_code_version_major;
uint32_t amd_kernel_code_version_minor;
uint16_t amd_machine_kind;
uint16_t amd_machine_version_major;
uint16_t amd_machine_version_minor;
uint16_t amd_machine_version_stepping;
uint32_t amd_kernel_code_version_major;
uint32_t amd_kernel_code_version_minor;
uint16_t amd_machine_kind;
uint16_t amd_machine_version_major;
uint16_t amd_machine_version_minor;
uint16_t amd_machine_version_stepping;
/* Byte offset (possibly negative) from start of amd_kernel_code_t
* object to kernel's entry point instruction. The actual code for
* the kernel is required to be 256 byte aligned to match hardware
* requirements (SQ cache line is 16). The code must be position
* independent code (PIC) for AMD devices to give runtime the
* option of copying code to discrete GPU memory or APU L2
* cache. The Finalizer should endeavour to allocate all kernel
* machine code in contiguous memory pages so that a device
* pre-fetcher will tend to only pre-fetch Kernel Code objects,
* improving cache performance.
*/
int64_t kernel_code_entry_byte_offset;
/* Byte offset (possibly negative) from start of amd_kernel_code_t
* object to kernel's entry point instruction. The actual code for
* the kernel is required to be 256 byte aligned to match hardware
* requirements (SQ cache line is 16). The code must be position
* independent code (PIC) for AMD devices to give runtime the
* option of copying code to discrete GPU memory or APU L2
* cache. The Finalizer should endeavour to allocate all kernel
* machine code in contiguous memory pages so that a device
* pre-fetcher will tend to only pre-fetch Kernel Code objects,
* improving cache performance.
*/
int64_t kernel_code_entry_byte_offset;
/* Range of bytes to consider prefetching expressed as an offset
* and size. The offset is from the start (possibly negative) of
* amd_kernel_code_t object. Set both to 0 if no prefetch
* information is available.
*/
int64_t kernel_code_prefetch_byte_offset;
uint64_t kernel_code_prefetch_byte_size;
/* Range of bytes to consider prefetching expressed as an offset
* and size. The offset is from the start (possibly negative) of
* amd_kernel_code_t object. Set both to 0 if no prefetch
* information is available.
*/
int64_t kernel_code_prefetch_byte_offset;
uint64_t kernel_code_prefetch_byte_size;
/* Number of bytes of scratch backing memory required for full
* occupancy of target chip. This takes into account the number of
* bytes of scratch per work-item, the wavefront size, the maximum
* number of wavefronts per CU, and the number of CUs. This is an
* upper limit on scratch. If the grid being dispatched is small it
* may only need less than this. If the kernel uses no scratch, or
* the Finalizer has not computed this value, it must be 0.
*/
uint64_t max_scratch_backing_memory_byte_size;
/* Number of bytes of scratch backing memory required for full
* occupancy of target chip. This takes into account the number of
* bytes of scratch per work-item, the wavefront size, the maximum
* number of wavefronts per CU, and the number of CUs. This is an
* upper limit on scratch. If the grid being dispatched is small it
* may only need less than this. If the kernel uses no scratch, or
* the Finalizer has not computed this value, it must be 0.
*/
uint64_t max_scratch_backing_memory_byte_size;
/* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
* COMPUTE_PGM_RSRC2 registers.
*/
uint64_t compute_pgm_resource_registers;
/* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
* COMPUTE_PGM_RSRC2 registers.
*/
uint64_t compute_pgm_resource_registers;
/* Code properties. See amd_code_property_mask_t for a full list of
* properties.
*/
uint32_t code_properties;
/* Code properties. See amd_code_property_mask_t for a full list of
* properties.
*/
uint32_t code_properties;
/* The amount of memory required for the combined private, spill
* and arg segments for a work-item in bytes. If
* is_dynamic_callstack is 1 then additional space must be added to
* this value for the call stack.
*/
uint32_t workitem_private_segment_byte_size;
/* The amount of memory required for the combined private, spill
* and arg segments for a work-item in bytes. If
* is_dynamic_callstack is 1 then additional space must be added to
* this value for the call stack.
*/
uint32_t workitem_private_segment_byte_size;
/* The amount of group segment memory required by a work-group in
* bytes. This does not include any dynamically allocated group
* segment memory that may be added when the kernel is
* dispatched.
*/
uint32_t workgroup_group_segment_byte_size;
/* The amount of group segment memory required by a work-group in
* bytes. This does not include any dynamically allocated group
* segment memory that may be added when the kernel is
* dispatched.
*/
uint32_t workgroup_group_segment_byte_size;
/* Number of bytes of GDS required by kernel dispatch. Must be 0 if
* not using GDS.
*/
uint32_t gds_segment_byte_size;
/* Number of bytes of GDS required by kernel dispatch. Must be 0 if
* not using GDS.
*/
uint32_t gds_segment_byte_size;
/* The size in bytes of the kernarg segment that holds the values
* of the arguments to the kernel. This could be used by CP to
* prefetch the kernarg segment pointed to by the dispatch packet.
*/
uint64_t kernarg_segment_byte_size;
/* The size in bytes of the kernarg segment that holds the values
* of the arguments to the kernel. This could be used by CP to
* prefetch the kernarg segment pointed to by the dispatch packet.
*/
uint64_t kernarg_segment_byte_size;
/* Number of fbarriers used in the kernel and all functions it
* calls. If the implementation uses group memory to allocate the
* fbarriers then that amount must already be included in the
* workgroup_group_segment_byte_size total.
*/
uint32_t workgroup_fbarrier_count;
/* Number of fbarriers used in the kernel and all functions it
* calls. If the implementation uses group memory to allocate the
* fbarriers then that amount must already be included in the
* workgroup_group_segment_byte_size total.
*/
uint32_t workgroup_fbarrier_count;
/* Number of scalar registers used by a wavefront. This includes
* the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
* and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
* trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
*/
uint16_t wavefront_sgpr_count;
/* Number of scalar registers used by a wavefront. This includes
* the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
* and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
* trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
*/
uint16_t wavefront_sgpr_count;
/* Number of vector registers used by each work-item. Used to set
* COMPUTE_PGM_RSRC1.VGPRS.
*/
uint16_t workitem_vgpr_count;
/* Number of vector registers used by each work-item. Used to set
* COMPUTE_PGM_RSRC1.VGPRS.
*/
uint16_t workitem_vgpr_count;
/* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
* first fixed VGPR number reserved.
*/
uint16_t reserved_vgpr_first;
/* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
* first fixed VGPR number reserved.
*/
uint16_t reserved_vgpr_first;
/* The number of consecutive VGPRs reserved by the client. If
* is_debug_supported then this count includes VGPRs reserved
* for debugger use.
*/
uint16_t reserved_vgpr_count;
/* The number of consecutive VGPRs reserved by the client. If
* is_debug_supported then this count includes VGPRs reserved
* for debugger use.
*/
uint16_t reserved_vgpr_count;
/* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
* first fixed SGPR number reserved.
*/
uint16_t reserved_sgpr_first;
/* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
* first fixed SGPR number reserved.
*/
uint16_t reserved_sgpr_first;
/* The number of consecutive SGPRs reserved by the client. If
* is_debug_supported then this count includes SGPRs reserved
* for debugger use.
*/
uint16_t reserved_sgpr_count;
/* The number of consecutive SGPRs reserved by the client. If
* is_debug_supported then this count includes SGPRs reserved
* for debugger use.
*/
uint16_t reserved_sgpr_count;
/* If is_debug_supported is 0 then must be 0. Otherwise, this is the
* fixed SGPR number used to hold the wave scratch offset for the
* entire kernel execution, or uint16_t(-1) if the register is not
* used or not known.
*/
uint16_t debug_wavefront_private_segment_offset_sgpr;
/* If is_debug_supported is 0 then must be 0. Otherwise, this is the
* fixed SGPR number used to hold the wave scratch offset for the
* entire kernel execution, or uint16_t(-1) if the register is not
* used or not known.
*/
uint16_t debug_wavefront_private_segment_offset_sgpr;
/* If is_debug_supported is 0 then must be 0. Otherwise, this is the
* fixed SGPR number of the first of 4 SGPRs used to hold the
* scratch V# used for the entire kernel execution, or uint16_t(-1)
* if the registers are not used or not known.
*/
uint16_t debug_private_segment_buffer_sgpr;
/* If is_debug_supported is 0 then must be 0. Otherwise, this is the
* fixed SGPR number of the first of 4 SGPRs used to hold the
* scratch V# used for the entire kernel execution, or uint16_t(-1)
* if the registers are not used or not known.
*/
uint16_t debug_private_segment_buffer_sgpr;
/* The maximum byte alignment of variables used by the kernel in
* the specified memory segment. Expressed as a power of two. Must
* be at least HSA_POWERTWO_16.
*/
uint8_t kernarg_segment_alignment;
uint8_t group_segment_alignment;
uint8_t private_segment_alignment;
/* The maximum byte alignment of variables used by the kernel in
* the specified memory segment. Expressed as a power of two. Must
* be at least HSA_POWERTWO_16.
*/
uint8_t kernarg_segment_alignment;
uint8_t group_segment_alignment;
uint8_t private_segment_alignment;
/* Wavefront size expressed as a power of two. Must be a power of 2
* in range 1..64 inclusive. Used to support runtime query that
* obtains wavefront size, which may be used by application to
* allocate dynamic group memory and set the dispatch work-group
* size.
*/
uint8_t wavefront_size;
/* Wavefront size expressed as a power of two. Must be a power of 2
* in range 1..64 inclusive. Used to support runtime query that
* obtains wavefront size, which may be used by application to
* allocate dynamic group memory and set the dispatch work-group
* size.
*/
uint8_t wavefront_size;
int32_t call_convention;
uint8_t reserved3[12];
uint64_t runtime_loader_kernel_symbol;
uint64_t control_directives[16];
int32_t call_convention;
uint8_t reserved3[12];
uint64_t runtime_loader_kernel_symbol;
uint64_t control_directives[16];
} amd_kernel_code_t;
#endif // AMDKERNELCODET_H

View file

@ -27,16 +27,17 @@
#ifndef GFX10_FORMAT_TABLE_H
#define GFX10_FORMAT_TABLE_H
#include <stdbool.h>
#include "pipe/p_format.h"
struct gfx10_format {
unsigned img_format:9;
#include <stdbool.h>
/* Various formats are only supported with workarounds for vertex fetch,
* and some 32_32_32 formats are supported natively, but only for buffers
* (possibly with some image support, actually, but no filtering). */
bool buffers_only:1;
/* Element type of gfx10_format_table, which is declared with
 * PIPE_FORMAT_COUNT entries.
 * NOTE(review): presumably the table is indexed by enum pipe_format -- confirm. */
struct gfx10_format {
/* 9-bit format code.
 * NOTE(review): presumably the hardware image-format enum value -- confirm
 * against whatever generates the table. */
unsigned img_format : 9;
/* Various formats are only supported with workarounds for vertex fetch,
* and some 32_32_32 formats are supported natively, but only for buffers
* (possibly with some image support, actually, but no filtering). */
bool buffers_only : 1;
};
extern const struct gfx10_format gfx10_format_table[PIPE_FORMAT_COUNT];

View file

@ -27,227 +27,227 @@
#include "amdgfxregs.h"
/* si values */
#define SI_CONFIG_REG_OFFSET 0x00008000
#define SI_CONFIG_REG_END 0x0000B000
#define SI_SH_REG_OFFSET 0x0000B000
#define SI_SH_REG_END 0x0000C000
#define SI_CONTEXT_REG_OFFSET 0x00028000
#define SI_CONTEXT_REG_END 0x00030000
#define CIK_UCONFIG_REG_OFFSET 0x00030000
#define CIK_UCONFIG_REG_END 0x00040000
#define SI_UCONFIG_PERF_REG_OFFSET 0x00034000
#define SI_UCONFIG_PERF_REG_END 0x00038000
#define SI_CONFIG_REG_OFFSET 0x00008000
#define SI_CONFIG_REG_END 0x0000B000
#define SI_SH_REG_OFFSET 0x0000B000
#define SI_SH_REG_END 0x0000C000
#define SI_CONTEXT_REG_OFFSET 0x00028000
#define SI_CONTEXT_REG_END 0x00030000
#define CIK_UCONFIG_REG_OFFSET 0x00030000
#define CIK_UCONFIG_REG_END 0x00040000
#define SI_UCONFIG_PERF_REG_OFFSET 0x00034000
#define SI_UCONFIG_PERF_REG_END 0x00038000
/* For register shadowing: */
#define SI_SH_REG_SPACE_SIZE (SI_SH_REG_END - SI_SH_REG_OFFSET)
#define SI_CONTEXT_REG_SPACE_SIZE (SI_CONTEXT_REG_END - SI_CONTEXT_REG_OFFSET)
#define SI_UCONFIG_REG_SPACE_SIZE (CIK_UCONFIG_REG_END - CIK_UCONFIG_REG_OFFSET)
#define SI_UCONFIG_PERF_REG_SPACE_SIZE (SI_UCONFIG_PERF_REG_END - SI_UCONFIG_PERF_REG_OFFSET)
#define SI_SH_REG_SPACE_SIZE (SI_SH_REG_END - SI_SH_REG_OFFSET)
#define SI_CONTEXT_REG_SPACE_SIZE (SI_CONTEXT_REG_END - SI_CONTEXT_REG_OFFSET)
#define SI_UCONFIG_REG_SPACE_SIZE (CIK_UCONFIG_REG_END - CIK_UCONFIG_REG_OFFSET)
#define SI_UCONFIG_PERF_REG_SPACE_SIZE (SI_UCONFIG_PERF_REG_END - SI_UCONFIG_PERF_REG_OFFSET)
#define SI_SHADOWED_SH_REG_OFFSET 0
#define SI_SHADOWED_CONTEXT_REG_OFFSET SI_SH_REG_SPACE_SIZE
#define SI_SHADOWED_UCONFIG_REG_OFFSET (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE)
#define SI_SHADOWED_REG_BUFFER_SIZE (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE + \
SI_UCONFIG_REG_SPACE_SIZE)
#define SI_SHADOWED_SH_REG_OFFSET 0
#define SI_SHADOWED_CONTEXT_REG_OFFSET SI_SH_REG_SPACE_SIZE
#define SI_SHADOWED_UCONFIG_REG_OFFSET (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE)
#define SI_SHADOWED_REG_BUFFER_SIZE \
(SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE + SI_UCONFIG_REG_SPACE_SIZE)
#define EVENT_TYPE_CACHE_FLUSH 0x6
#define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10
#define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10
#define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
#define EVENT_TYPE_ZPASS_DONE 0x15
#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16
#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f
#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20
#define EVENT_TYPE(x) ((x) << 0)
#define EVENT_INDEX(x) ((x) << 8)
/* 0 - any non-TS event
* 1 - ZPASS_DONE
* 2 - SAMPLE_PIPELINESTAT
* 3 - SAMPLE_STREAMOUTSTAT*
* 4 - *S_PARTIAL_FLUSH
* 5 - TS events
*/
#define EVENT_TYPE_ZPASS_DONE 0x15
#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16
#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f
#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20
#define EVENT_TYPE(x) ((x) << 0)
#define EVENT_INDEX(x) ((x) << 8)
/* 0 - any non-TS event
* 1 - ZPASS_DONE
* 2 - SAMPLE_PIPELINESTAT
* 3 - SAMPLE_STREAMOUTSTAT*
* 4 - *S_PARTIAL_FLUSH
* 5 - TS events
*/
/* EVENT_WRITE_EOP (SI-VI) & RELEASE_MEM (GFX9) */
#define EVENT_TCL1_VOL_ACTION_ENA (1 << 12)
#define EVENT_TC_VOL_ACTION_ENA (1 << 13)
#define EVENT_TC_WB_ACTION_ENA (1 << 15)
#define EVENT_TCL1_ACTION_ENA (1 << 16)
#define EVENT_TC_ACTION_ENA (1 << 17)
#define EVENT_TC_NC_ACTION_ENA (1 << 19) /* GFX9+ */
#define EVENT_TC_WC_ACTION_ENA (1 << 20) /* GFX9+ */
#define EVENT_TC_MD_ACTION_ENA (1 << 21) /* GFX9+ */
#define EVENT_TCL1_VOL_ACTION_ENA (1 << 12)
#define EVENT_TC_VOL_ACTION_ENA (1 << 13)
#define EVENT_TC_WB_ACTION_ENA (1 << 15)
#define EVENT_TCL1_ACTION_ENA (1 << 16)
#define EVENT_TC_ACTION_ENA (1 << 17)
#define EVENT_TC_NC_ACTION_ENA (1 << 19) /* GFX9+ */
#define EVENT_TC_WC_ACTION_ENA (1 << 20) /* GFX9+ */
#define EVENT_TC_MD_ACTION_ENA (1 << 21) /* GFX9+ */
#define PREDICATION_OP_CLEAR 0x0
#define PREDICATION_OP_ZPASS 0x1
#define PREDICATION_OP_CLEAR 0x0
#define PREDICATION_OP_ZPASS 0x1
#define PREDICATION_OP_PRIMCOUNT 0x2
#define PREDICATION_OP_BOOL64 0x3
#define PREDICATION_OP_BOOL64 0x3
#define PRED_OP(x) ((x) << 16)
/* Bit 31 of the predication dword. Built from an unsigned constant because
 * (1 << 31) shifts into the sign bit of a 32-bit int, which is undefined
 * behaviour and produces a negative value on common compilers. */
#define PREDICATION_CONTINUE (1u << 31)
#define PREDICATION_HINT_WAIT (0 << 12)
#define PREDICATION_HINT_WAIT (0 << 12)
#define PREDICATION_HINT_NOWAIT_DRAW (1 << 12)
#define PREDICATION_DRAW_NOT_VISIBLE (0 << 8)
#define PREDICATION_DRAW_VISIBLE (1 << 8)
#define PREDICATION_DRAW_VISIBLE (1 << 8)
#define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7
#define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7
/* All registers defined in this packet section don't exist and the only
* purpose of these definitions is to define packet encoding that
* the IB parser understands, and also to have an accurate documentation.
*/
#define PKT3_NOP 0x10
#define PKT3_SET_BASE 0x11
#define PKT3_CLEAR_STATE 0x12
#define PKT3_INDEX_BUFFER_SIZE 0x13
#define PKT3_DISPATCH_DIRECT 0x15
#define PKT3_DISPATCH_INDIRECT 0x16
#define PKT3_OCCLUSION_QUERY 0x1F /* new for CIK */
#define PKT3_SET_PREDICATION 0x20
#define PKT3_COND_EXEC 0x22
#define PKT3_PRED_EXEC 0x23
#define PKT3_DRAW_INDIRECT 0x24
#define PKT3_DRAW_INDEX_INDIRECT 0x25
#define PKT3_INDEX_BASE 0x26
#define PKT3_DRAW_INDEX_2 0x27
#define PKT3_CONTEXT_CONTROL 0x28
#define CC0_LOAD_GLOBAL_CONFIG(x) (((unsigned)(x) & 0x1) << 0)
#define CC0_LOAD_PER_CONTEXT_STATE(x) (((unsigned)(x) & 0x1) << 1)
#define CC0_LOAD_GLOBAL_UCONFIG(x) (((unsigned)(x) & 0x1) << 15)
#define CC0_LOAD_GFX_SH_REGS(x) (((unsigned)(x) & 0x1) << 16)
#define CC0_LOAD_CS_SH_REGS(x) (((unsigned)(x) & 0x1) << 24)
#define CC0_LOAD_CE_RAM(x) (((unsigned)(x) & 0x1) << 28)
#define CC0_UPDATE_LOAD_ENABLES(x) (((unsigned)(x) & 0x1) << 31)
#define CC1_SHADOW_GLOBAL_CONFIG(x) (((unsigned)(x) & 0x1) << 0)
#define CC1_SHADOW_PER_CONTEXT_STATE(x) (((unsigned)(x) & 0x1) << 1)
#define CC1_SHADOW_GLOBAL_UCONFIG(x) (((unsigned)(x) & 0x1) << 15)
#define CC1_SHADOW_GFX_SH_REGS(x) (((unsigned)(x) & 0x1) << 16)
#define CC1_SHADOW_CS_SH_REGS(x) (((unsigned)(x) & 0x1) << 24)
#define CC1_UPDATE_SHADOW_ENABLES(x) (((unsigned)(x) & 0x1) << 31)
#define PKT3_INDEX_TYPE 0x2A /* not on GFX9 */
#define PKT3_DRAW_INDIRECT_MULTI 0x2C
#define R_2C3_DRAW_INDEX_LOC 0x2C3
#define S_2C3_COUNT_INDIRECT_ENABLE(x) (((unsigned)(x) & 0x1) << 30)
#define S_2C3_DRAW_INDEX_ENABLE(x) (((unsigned)(x) & 0x1) << 31)
#define PKT3_DRAW_INDEX_AUTO 0x2D
#define PKT3_DRAW_INDEX_IMMD 0x2E /* not on CIK */
#define PKT3_NUM_INSTANCES 0x2F
#define PKT3_DRAW_INDEX_MULTI_AUTO 0x30
#define PKT3_INDIRECT_BUFFER_SI 0x32 /* not on CIK */
#define PKT3_INDIRECT_BUFFER_CONST 0x33
#define PKT3_STRMOUT_BUFFER_UPDATE 0x34
#define STRMOUT_STORE_BUFFER_FILLED_SIZE 1
#define STRMOUT_OFFSET_SOURCE(x) (((unsigned)(x) & 0x3) << 1)
#define STRMOUT_OFFSET_FROM_PACKET 0
#define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1
#define STRMOUT_OFFSET_FROM_MEM 2
#define STRMOUT_OFFSET_NONE 3
#define STRMOUT_DATA_TYPE(x) (((unsigned)(x) & 0x1) << 7)
#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8)
#define PKT3_DRAW_INDEX_OFFSET_2 0x35
#define PKT3_WRITE_DATA 0x37
#define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38
#define PKT3_MEM_SEMAPHORE 0x39
#define PKT3_MPEG_INDEX 0x3A /* not on CIK */
#define PKT3_WAIT_REG_MEM 0x3C
#define WAIT_REG_MEM_EQUAL 3
#define WAIT_REG_MEM_NOT_EQUAL 4
#define WAIT_REG_MEM_GREATER_OR_EQUAL 5
#define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x) & 0x3) << 4)
#define WAIT_REG_MEM_PFP (1 << 8)
#define PKT3_MEM_WRITE 0x3D /* not on CIK */
#define PKT3_INDIRECT_BUFFER_CIK 0x3F /* new on CIK */
#define PKT3_NOP 0x10
#define PKT3_SET_BASE 0x11
#define PKT3_CLEAR_STATE 0x12
#define PKT3_INDEX_BUFFER_SIZE 0x13
#define PKT3_DISPATCH_DIRECT 0x15
#define PKT3_DISPATCH_INDIRECT 0x16
#define PKT3_OCCLUSION_QUERY 0x1F /* new for CIK */
#define PKT3_SET_PREDICATION 0x20
#define PKT3_COND_EXEC 0x22
#define PKT3_PRED_EXEC 0x23
#define PKT3_DRAW_INDIRECT 0x24
#define PKT3_DRAW_INDEX_INDIRECT 0x25
#define PKT3_INDEX_BASE 0x26
#define PKT3_DRAW_INDEX_2 0x27
#define PKT3_CONTEXT_CONTROL 0x28
#define CC0_LOAD_GLOBAL_CONFIG(x) (((unsigned)(x)&0x1) << 0)
#define CC0_LOAD_PER_CONTEXT_STATE(x) (((unsigned)(x)&0x1) << 1)
#define CC0_LOAD_GLOBAL_UCONFIG(x) (((unsigned)(x)&0x1) << 15)
#define CC0_LOAD_GFX_SH_REGS(x) (((unsigned)(x)&0x1) << 16)
#define CC0_LOAD_CS_SH_REGS(x) (((unsigned)(x)&0x1) << 24)
#define CC0_LOAD_CE_RAM(x) (((unsigned)(x)&0x1) << 28)
#define CC0_UPDATE_LOAD_ENABLES(x) (((unsigned)(x)&0x1) << 31)
#define CC1_SHADOW_GLOBAL_CONFIG(x) (((unsigned)(x)&0x1) << 0)
#define CC1_SHADOW_PER_CONTEXT_STATE(x) (((unsigned)(x)&0x1) << 1)
#define CC1_SHADOW_GLOBAL_UCONFIG(x) (((unsigned)(x)&0x1) << 15)
#define CC1_SHADOW_GFX_SH_REGS(x) (((unsigned)(x)&0x1) << 16)
#define CC1_SHADOW_CS_SH_REGS(x) (((unsigned)(x)&0x1) << 24)
#define CC1_UPDATE_SHADOW_ENABLES(x) (((unsigned)(x)&0x1) << 31)
#define PKT3_INDEX_TYPE 0x2A /* not on GFX9 */
#define PKT3_DRAW_INDIRECT_MULTI 0x2C
#define R_2C3_DRAW_INDEX_LOC 0x2C3
#define S_2C3_COUNT_INDIRECT_ENABLE(x) (((unsigned)(x)&0x1) << 30)
#define S_2C3_DRAW_INDEX_ENABLE(x) (((unsigned)(x)&0x1) << 31)
#define PKT3_DRAW_INDEX_AUTO 0x2D
#define PKT3_DRAW_INDEX_IMMD 0x2E /* not on CIK */
#define PKT3_NUM_INSTANCES 0x2F
#define PKT3_DRAW_INDEX_MULTI_AUTO 0x30
#define PKT3_INDIRECT_BUFFER_SI 0x32 /* not on CIK */
#define PKT3_INDIRECT_BUFFER_CONST 0x33
#define PKT3_STRMOUT_BUFFER_UPDATE 0x34
#define STRMOUT_STORE_BUFFER_FILLED_SIZE 1
#define STRMOUT_OFFSET_SOURCE(x) (((unsigned)(x)&0x3) << 1)
#define STRMOUT_OFFSET_FROM_PACKET 0
#define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1
#define STRMOUT_OFFSET_FROM_MEM 2
#define STRMOUT_OFFSET_NONE 3
#define STRMOUT_DATA_TYPE(x) (((unsigned)(x)&0x1) << 7)
#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x)&0x3) << 8)
#define PKT3_DRAW_INDEX_OFFSET_2 0x35
#define PKT3_WRITE_DATA 0x37
#define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38
#define PKT3_MEM_SEMAPHORE 0x39
#define PKT3_MPEG_INDEX 0x3A /* not on CIK */
#define PKT3_WAIT_REG_MEM 0x3C
#define WAIT_REG_MEM_EQUAL 3
#define WAIT_REG_MEM_NOT_EQUAL 4
#define WAIT_REG_MEM_GREATER_OR_EQUAL 5
#define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x)&0x3) << 4)
#define WAIT_REG_MEM_PFP (1 << 8)
#define PKT3_MEM_WRITE 0x3D /* not on CIK */
#define PKT3_INDIRECT_BUFFER_CIK 0x3F /* new on CIK */
#define PKT3_COPY_DATA 0x40
#define COPY_DATA_SRC_SEL(x) ((x) & 0xf)
#define COPY_DATA_REG 0
#define COPY_DATA_SRC_MEM 1 /* only valid as source */
#define COPY_DATA_TC_L2 2
#define COPY_DATA_GDS 3
#define COPY_DATA_PERF 4
#define COPY_DATA_IMM 5
#define COPY_DATA_TIMESTAMP 9
#define COPY_DATA_DST_SEL(x) (((unsigned)(x) & 0xf) << 8)
#define COPY_DATA_DST_MEM_GRBM 1 /* sync across GRBM, deprecated */
#define COPY_DATA_TC_L2 2
#define COPY_DATA_GDS 3
#define COPY_DATA_PERF 4
#define COPY_DATA_DST_MEM 5
#define COPY_DATA_COUNT_SEL (1 << 16)
#define COPY_DATA_WR_CONFIRM (1 << 20)
#define COPY_DATA_ENGINE_PFP (1 << 30)
#define PKT3_PFP_SYNC_ME 0x42
#define PKT3_COPY_DATA 0x40
#define COPY_DATA_SRC_SEL(x) ((x)&0xf)
#define COPY_DATA_REG 0
#define COPY_DATA_SRC_MEM 1 /* only valid as source */
#define COPY_DATA_TC_L2 2
#define COPY_DATA_GDS 3
#define COPY_DATA_PERF 4
#define COPY_DATA_IMM 5
#define COPY_DATA_TIMESTAMP 9
#define COPY_DATA_DST_SEL(x) (((unsigned)(x)&0xf) << 8)
#define COPY_DATA_DST_MEM_GRBM 1 /* sync across GRBM, deprecated */
#define COPY_DATA_TC_L2 2
#define COPY_DATA_GDS 3
#define COPY_DATA_PERF 4
#define COPY_DATA_DST_MEM 5
#define COPY_DATA_COUNT_SEL (1 << 16)
#define COPY_DATA_WR_CONFIRM (1 << 20)
#define COPY_DATA_ENGINE_PFP (1 << 30)
#define PKT3_PFP_SYNC_ME 0x42
#define PKT3_SURFACE_SYNC 0x43 /* deprecated on CIK, use ACQUIRE_MEM */
#define PKT3_ME_INITIALIZE 0x44 /* not on CIK */
#define PKT3_COND_WRITE 0x45
#define PKT3_EVENT_WRITE 0x46
#define PKT3_EVENT_WRITE_EOP 0x47 /* not on GFX9 */
#define EOP_DST_SEL(x) ((x) << 16)
#define EOP_DST_SEL_MEM 0
#define EOP_DST_SEL_TC_L2 1
#define EOP_INT_SEL(x) ((x) << 24)
#define EOP_INT_SEL_NONE 0
#define EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM 3
/* Places the data selector at bits [31:29]. The operand is cast to unsigned
 * before shifting (as the other field macros in this file do): a selector with
 * bit 2 set, such as EOP_DATA_SEL_GDS (5), would otherwise overflow int. */
#define EOP_DATA_SEL(x) ((unsigned)(x) << 29)
#define EOP_DATA_SEL_DISCARD 0
#define EOP_DATA_SEL_VALUE_32BIT 1
#define EOP_DATA_SEL_VALUE_64BIT 2
#define EOP_DATA_SEL_TIMESTAMP 3
#define EOP_DATA_SEL_GDS 5
#define EOP_DATA_GDS(dw_offset, num_dwords) ((dw_offset) | ((unsigned)(num_dwords) << 16))
#define EOP_DST_SEL(x) ((x) << 16)
#define EOP_DST_SEL_MEM 0
#define EOP_DST_SEL_TC_L2 1
#define EOP_INT_SEL(x) ((x) << 24)
#define EOP_INT_SEL_NONE 0
#define EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM 3
/* Places the data selector at bits [31:29]. The operand is cast to unsigned
 * before shifting (as the other field macros in this file do): a selector with
 * bit 2 set, such as EOP_DATA_SEL_GDS (5), would otherwise overflow int. */
#define EOP_DATA_SEL(x) ((unsigned)(x) << 29)
#define EOP_DATA_SEL_DISCARD 0
#define EOP_DATA_SEL_VALUE_32BIT 1
#define EOP_DATA_SEL_VALUE_64BIT 2
#define EOP_DATA_SEL_TIMESTAMP 3
#define EOP_DATA_SEL_GDS 5
#define EOP_DATA_GDS(dw_offset, num_dwords) ((dw_offset) | ((unsigned)(num_dwords) << 16))
/* CP DMA bug: Any use of CP_DMA.DST_SEL=TC must be avoided when EOS packets
* are used. Use DST_SEL=MC instead. For prefetch, use SRC_SEL=TC and
* DST_SEL=MC. Only CIK chips are affected.
*/
/* fix CP DMA before uncommenting: */
/*#define PKT3_EVENT_WRITE_EOS 0x48*/ /* not on GFX9 */
#define PKT3_RELEASE_MEM 0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */
#define PKT3_CONTEXT_REG_RMW 0x51 /* older firmware versions on older chips don't have this */
#define PKT3_ONE_REG_WRITE 0x57 /* not on CIK */
#define PKT3_ACQUIRE_MEM 0x58 /* new for CIK */
#define PKT3_REWIND 0x59 /* VI+ [any ring] or CIK [compute ring only] */
#define PKT3_LOAD_UCONFIG_REG 0x5E /* GFX7+ */
#define PKT3_LOAD_SH_REG 0x5F
#define PKT3_LOAD_CONTEXT_REG 0x61
#define PKT3_SET_CONFIG_REG 0x68
#define PKT3_SET_CONTEXT_REG 0x69
#define PKT3_SET_SH_REG 0x76
#define PKT3_SET_SH_REG_OFFSET 0x77
#define PKT3_SET_UCONFIG_REG 0x79 /* new for CIK */
#define PKT3_SET_UCONFIG_REG_INDEX 0x7A /* new for GFX9, CP ucode version >= 26 */
#define PKT3_LOAD_CONST_RAM 0x80
#define PKT3_WRITE_CONST_RAM 0x81
#define PKT3_DUMP_CONST_RAM 0x83
#define PKT3_INCREMENT_CE_COUNTER 0x84
#define PKT3_INCREMENT_DE_COUNTER 0x85
#define PKT3_WAIT_ON_CE_COUNTER 0x86
#define PKT3_SET_SH_REG_INDEX 0x9B
#define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* new for VI */
#define PKT3_RELEASE_MEM 0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */
#define PKT3_CONTEXT_REG_RMW 0x51 /* older firmware versions on older chips don't have this */
#define PKT3_ONE_REG_WRITE 0x57 /* not on CIK */
#define PKT3_ACQUIRE_MEM 0x58 /* new for CIK */
#define PKT3_REWIND 0x59 /* VI+ [any ring] or CIK [compute ring only] */
#define PKT3_LOAD_UCONFIG_REG 0x5E /* GFX7+ */
#define PKT3_LOAD_SH_REG 0x5F
#define PKT3_LOAD_CONTEXT_REG 0x61
#define PKT3_SET_CONFIG_REG 0x68
#define PKT3_SET_CONTEXT_REG 0x69
#define PKT3_SET_SH_REG 0x76
#define PKT3_SET_SH_REG_OFFSET 0x77
#define PKT3_SET_UCONFIG_REG 0x79 /* new for CIK */
#define PKT3_SET_UCONFIG_REG_INDEX 0x7A /* new for GFX9, CP ucode version >= 26 */
#define PKT3_LOAD_CONST_RAM 0x80
#define PKT3_WRITE_CONST_RAM 0x81
#define PKT3_DUMP_CONST_RAM 0x83
#define PKT3_INCREMENT_CE_COUNTER 0x84
#define PKT3_INCREMENT_DE_COUNTER 0x85
#define PKT3_WAIT_ON_CE_COUNTER 0x86
#define PKT3_SET_SH_REG_INDEX 0x9B
#define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* new for VI */
#define PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30)
#define PKT_TYPE_G(x) (((x) >> 30) & 0x3)
#define PKT_TYPE_C 0x3FFFFFFF
#define PKT_COUNT_S(x) (((unsigned)(x) & 0x3FFF) << 16)
#define PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF)
#define PKT_COUNT_C 0xC000FFFF
#define PKT0_BASE_INDEX_S(x) (((unsigned)(x) & 0xFFFF) << 0)
#define PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF)
#define PKT0_BASE_INDEX_C 0xFFFF0000
#define PKT3_IT_OPCODE_S(x) (((unsigned)(x) & 0xFF) << 8)
#define PKT3_IT_OPCODE_G(x) (((x) >> 8) & 0xFF)
#define PKT3_IT_OPCODE_C 0xFFFF00FF
#define PKT3_PREDICATE(x) (((x) >> 0) & 0x1)
#define PKT3_SHADER_TYPE_S(x) (((unsigned)(x) & 0x1) << 1)
#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT_COUNT_S(count) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate))
#define PKT_TYPE_S(x) (((unsigned)(x)&0x3) << 30)
#define PKT_TYPE_G(x) (((x) >> 30) & 0x3)
#define PKT_TYPE_C 0x3FFFFFFF
#define PKT_COUNT_S(x) (((unsigned)(x)&0x3FFF) << 16)
#define PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF)
#define PKT_COUNT_C 0xC000FFFF
#define PKT0_BASE_INDEX_S(x) (((unsigned)(x)&0xFFFF) << 0)
#define PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF)
#define PKT0_BASE_INDEX_C 0xFFFF0000
#define PKT3_IT_OPCODE_S(x) (((unsigned)(x)&0xFF) << 8)
#define PKT3_IT_OPCODE_G(x) (((x) >> 8) & 0xFF)
#define PKT3_IT_OPCODE_C 0xFFFF00FF
#define PKT3_PREDICATE(x) (((x) >> 0) & 0x1)
#define PKT3_SHADER_TYPE_S(x) (((unsigned)(x)&0x1) << 1)
#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
#define PKT3(op, count, predicate) \
(PKT_TYPE_S(3) | PKT_COUNT_S(count) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate))
#define PKT2_NOP_PAD PKT_TYPE_S(2)
#define PKT3_NOP_PAD PKT3(PKT3_NOP, 0x3fff, 0) /* header-only version */
#define PKT2_NOP_PAD PKT_TYPE_S(2)
#define PKT3_NOP_PAD PKT3(PKT3_NOP, 0x3fff, 0) /* header-only version */
#define PKT3_CP_DMA 0x41
#define PKT3_CP_DMA 0x41
/* 1. header
* 2. SRC_ADDR_LO [31:0] or DATA [31:0]
* 3. CP_SYNC [31] | SRC_SEL [30:29] | ENGINE [27] | DST_SEL [21:20] | SRC_ADDR_HI [15:0]
@ -256,7 +256,7 @@
* 6. COMMAND [29:22] | BYTE_COUNT [20:0]
*/
#define PKT3_DMA_DATA 0x50 /* new for CIK */
#define PKT3_DMA_DATA 0x50 /* new for CIK */
/* 1. header
* 2. CP_SYNC [31] | SRC_SEL [30:29] | DST_SEL [21:20] | ENGINE [0]
* 2. SRC_ADDR_LO [31:0] or DATA [31:0]
@ -267,69 +267,70 @@
*/
/* SI async DMA packets */
#define SI_DMA_PACKET(cmd, sub_cmd, n) ((((unsigned)(cmd) & 0xF) << 28) | \
(((unsigned)(sub_cmd) & 0xFF) << 20) |\
(((unsigned)(n) & 0xFFFFF) << 0))
#define SI_DMA_PACKET(cmd, sub_cmd, n) \
((((unsigned)(cmd)&0xF) << 28) | (((unsigned)(sub_cmd)&0xFF) << 20) | \
(((unsigned)(n)&0xFFFFF) << 0))
/* SI async DMA Packet types */
#define SI_DMA_PACKET_WRITE 0x2
#define SI_DMA_PACKET_COPY 0x3
#define SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE 0xfffe0
#define SI_DMA_PACKET_WRITE 0x2
#define SI_DMA_PACKET_COPY 0x3
#define SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE 0xfffe0
/* The documentation says 0xffff8 is the maximum size in dwords, which is
* 0x3fffe0 in bytes. */
#define SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE 0x3fffe0
#define SI_DMA_COPY_DWORD_ALIGNED 0x00
#define SI_DMA_COPY_BYTE_ALIGNED 0x40
#define SI_DMA_COPY_TILED 0x8
#define SI_DMA_PACKET_INDIRECT_BUFFER 0x4
#define SI_DMA_PACKET_SEMAPHORE 0x5
#define SI_DMA_PACKET_FENCE 0x6
#define SI_DMA_PACKET_TRAP 0x7
#define SI_DMA_PACKET_SRBM_WRITE 0x9
#define SI_DMA_PACKET_CONSTANT_FILL 0xd
#define SI_DMA_PACKET_NOP 0xf
#define SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE 0x3fffe0
#define SI_DMA_COPY_DWORD_ALIGNED 0x00
#define SI_DMA_COPY_BYTE_ALIGNED 0x40
#define SI_DMA_COPY_TILED 0x8
#define SI_DMA_PACKET_INDIRECT_BUFFER 0x4
#define SI_DMA_PACKET_SEMAPHORE 0x5
#define SI_DMA_PACKET_FENCE 0x6
#define SI_DMA_PACKET_TRAP 0x7
#define SI_DMA_PACKET_SRBM_WRITE 0x9
#define SI_DMA_PACKET_CONSTANT_FILL 0xd
#define SI_DMA_PACKET_NOP 0xf
/* CIK async DMA packets */
#define CIK_SDMA_PACKET(op, sub_op, n) ((((unsigned)(n) & 0xFFFF) << 16) | \
(((unsigned)(sub_op) & 0xFF) << 8) | \
(((unsigned)(op) & 0xFF) << 0))
#define CIK_SDMA_PACKET(op, sub_op, n) \
((((unsigned)(n)&0xFFFF) << 16) | (((unsigned)(sub_op)&0xFF) << 8) | \
(((unsigned)(op)&0xFF) << 0))
/* CIK async DMA packet types */
#define CIK_SDMA_OPCODE_NOP 0x0
#define CIK_SDMA_OPCODE_COPY 0x1
#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR 0x0
#define CIK_SDMA_COPY_SUB_OPCODE_TILED 0x1
#define CIK_SDMA_COPY_SUB_OPCODE_SOA 0x3
#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4
#define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW 0x5
#define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW 0x6
#define CIK_SDMA_OPCODE_WRITE 0x2
#define SDMA_WRITE_SUB_OPCODE_LINEAR 0x0
/* "WRTIE" is a misspelling of "WRITE"; the name is kept so existing users
 * still compile, and a correctly spelled alias is provided next to it. */
#define SDMA_WRTIE_SUB_OPCODE_TILED 0x1
#define SDMA_WRITE_SUB_OPCODE_TILED SDMA_WRTIE_SUB_OPCODE_TILED
#define CIK_SDMA_OPCODE_INDIRECT_BUFFER 0x4
#define CIK_SDMA_PACKET_FENCE 0x5
#define CIK_SDMA_PACKET_TRAP 0x6
#define CIK_SDMA_PACKET_SEMAPHORE 0x7
#define CIK_SDMA_PACKET_CONSTANT_FILL 0xb
#define CIK_SDMA_OPCODE_TIMESTAMP 0xd
#define SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP 0x0
#define SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP 0x1
#define SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP 0x2
#define CIK_SDMA_PACKET_SRBM_WRITE 0xe
#define CIK_SDMA_OPCODE_NOP 0x0
#define CIK_SDMA_OPCODE_COPY 0x1
#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR 0x0
#define CIK_SDMA_COPY_SUB_OPCODE_TILED 0x1
#define CIK_SDMA_COPY_SUB_OPCODE_SOA 0x3
#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4
#define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW 0x5
#define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW 0x6
#define CIK_SDMA_OPCODE_WRITE 0x2
#define SDMA_WRITE_SUB_OPCODE_LINEAR 0x0
/* "WRTIE" is a misspelling of "WRITE"; the name is kept so existing users
 * still compile, and a correctly spelled alias is provided next to it. */
#define SDMA_WRTIE_SUB_OPCODE_TILED 0x1
#define SDMA_WRITE_SUB_OPCODE_TILED SDMA_WRTIE_SUB_OPCODE_TILED
#define CIK_SDMA_OPCODE_INDIRECT_BUFFER 0x4
#define CIK_SDMA_PACKET_FENCE 0x5
#define CIK_SDMA_PACKET_TRAP 0x6
#define CIK_SDMA_PACKET_SEMAPHORE 0x7
#define CIK_SDMA_PACKET_CONSTANT_FILL 0xb
#define CIK_SDMA_OPCODE_TIMESTAMP 0xd
#define SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP 0x0
#define SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP 0x1
#define SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP 0x2
#define CIK_SDMA_PACKET_SRBM_WRITE 0xe
/* There is apparently an undocumented HW limitation that
prevents the HW from copying the last 255 bytes of (1 << 22) - 1 */
#define CIK_SDMA_COPY_MAX_SIZE 0x3fff00 /* almost 4 MB*/
#define GFX103_SDMA_COPY_MAX_SIZE 0x3fffff00 /* almost 1 GB */
#define CIK_SDMA_COPY_MAX_SIZE 0x3fff00 /* almost 4 MB*/
#define GFX103_SDMA_COPY_MAX_SIZE 0x3fffff00 /* almost 1 GB */
enum amd_cmp_class_flags {
S_NAN = 1 << 0, // Signaling NaN
Q_NAN = 1 << 1, // Quiet NaN
N_INFINITY = 1 << 2, // Negative infinity
N_NORMAL = 1 << 3, // Negative normal
N_SUBNORMAL = 1 << 4, // Negative subnormal
N_ZERO = 1 << 5, // Negative zero
P_ZERO = 1 << 6, // Positive zero
P_SUBNORMAL = 1 << 7, // Positive subnormal
P_NORMAL = 1 << 8, // Positive normal
P_INFINITY = 1 << 9 // Positive infinity
/* Floating-point value classes, one bit per class, so several classes can be
 * OR-ed together into a single mask. */
enum amd_cmp_class_flags
{
   S_NAN       = 0x001, /* signaling NaN */
   Q_NAN       = 0x002, /* quiet NaN */
   N_INFINITY  = 0x004, /* negative infinity */
   N_NORMAL    = 0x008, /* negative normal */
   N_SUBNORMAL = 0x010, /* negative subnormal */
   N_ZERO      = 0x020, /* negative zero */
   P_ZERO      = 0x040, /* positive zero */
   P_SUBNORMAL = 0x080, /* positive subnormal */
   P_NORMAL    = 0x100, /* positive normal */
   P_INFINITY  = 0x200  /* positive infinity */
};
#endif /* _SID_H */