intel: Add executor tool

Add a tool that programs the hardware the minimum amount to be
able to execute compute shaders and then executes a script that
can perform data manipulation and dispatch execution of the shaders
(written in Xe assembly).

The goal is to have a tool to experiment directly with certain
assembly instructions and the shared units without having to
instrument the drivers.

To make it more convenient to write assembly, a few macros (indicated by
the @-symbol) are expanded into the full instructions.

For example, the script

```
  local r = execute {
    data={ [42] = 0x100 },
    src=[[
      @mov     g1      42
      @read    g2      g1

      @id      g3

      add(8)   g4<1>UD  g2<8,8,1>UD  g3<8,8,1>UD  { align1 @1 1Q };

      @write   g3       g4
      @eot
    ]]
  }

  dump(r, 4)
```

produces

```
  [0x00000000] 0x00000100 0x00000101 0x00000102 0x00000103
```

There's a help message inside the code that describes the script
environment and the macros for assembly sources.

Acked-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30062>
This commit is contained in:
Caio Oliveira 2024-07-06 21:44:45 -07:00 committed by Marge Bot
parent 6267585778
commit e72bf2d02f
12 changed files with 1729 additions and 0 deletions

View file

@ -0,0 +1,41 @@
-- Bitfield insert (bfi1 + bfi2) example.
-- BFI seems available on Gfx9, need to fix the emission code for that.
-- Until then verx10 == 90 is left out of the accepted versions below.
check_verx10(110, 120, 125, 200)
--- CPU reference for the bfi1 + bfi2 sequence run by BFI().
-- @param a field width (only the low 5 bits are used)
-- @param b field offset (only the low 5 bits are used)
-- @param c value to insert
-- @param d base value receiving the inserted field
-- @return the 32-bit result of inserting `c` into `d`
function BFI_simulation(a, b, c, d)
   local width = a & 0x1F
   local offset = b & 0x1F
   local mask = ((1 << width) - 1) << offset
   local merged = ((c << offset) & mask) | (d & ~mask)
   -- Lua integers are 64-bit: when width + offset exceeds 32 the mask (and
   -- so the merged value) spills above bit 31. The GPU registers used here
   -- are 32-bit (UD), so keep only the low 32 bits for a fair comparison.
   return merged & 0xFFFFFFFF
end
--- Run bfi1 followed by bfi2 on the GPU.
-- The four arguments are stored in dwords 0..3 of the data buffer, read back
-- by the shader, combined, and the result is written to the buffer at the
-- offsets produced by @id.
-- @return dword 0 of the data buffer after execution
function BFI(a, b, c, d)
   local shader = [[
@id g9
@mov g11 0
@mov g12 1
@mov g13 2
@mov g14 3
@read g1 g11
@read g2 g12
@read g3 g13
@read g4 g14
bfi1(8) g5<1>UD g1<8,8,1>UD g2<8,8,1>UD { align1 @1 1Q };
bfi2(8) g6<1>UD g5<8,8,1>UD g3<8,8,1>UD g4<8,8,1>UD { align1 @1 1Q };
@write g9 g6
@eot
]]
   local inputs = { [0] = a, [1] = b, [2] = c, [3] = d }
   local result = execute { data = inputs, src = shader }
   return result[0]
end
--- Format an integer as "0x" followed by at least 8 lowercase hex digits.
function Hex(v)
   return ("0x%08x"):format(v)
end
-- Compare the GPU result against the CPU reference for one sample input.
local width, offset = 12, 12
local insert, base = 0xAAAAAAAA, 0xBBBBBBBB
print("calculated", Hex(BFI(width, offset, insert, base)))
print("expected", Hex(BFI_simulation(width, offset, insert, base)))

View file

@ -0,0 +1,41 @@
--[[
Execute the example from the Dot Product 4 Accumulate
instruction as seen in the PRM.
mov (1) r1.0:d 0x0102037F:d
// (char4)(0x1,0x2,0x3,0x7F)
mov (1) r2.0:d 50:d
dp4a (1) r3.0:d r2:d r1:d r1:d
// r3.0 = 50 + (0x1*0x1 + 0x2*0x2 + 0x3*0x3 + 0x7F*0x7F)
// = 50 + (1 + 4 + 9 + 16129)
// = 16193
--]]
-- This script only proceeds when the Gfx version being executed is 12.
check_ver(12)
--- CPU reference for dp4a: accumulate a 4-element dot product onto `c`.
-- @param a array with elements 1..4
-- @param b array with elements 1..4
-- @param c accumulator start value
-- @return c + a[1]*b[1] + a[2]*b[2] + a[3]*b[3] + a[4]*b[4]
function DP4A(a, b, c)
   return c + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] + a[4] * b[4]
end
-- Run the PRM example on the GPU and print it next to the CPU reference.
local shader = [[
@id g9
@mov g1 0x0102037F
@mov g2 50
dp4a(8) g3<1>UD g2<8,8,1>UD g1<8,8,1>UD g1<8,8,1>UD { align1 @1 1Q };
@write g9 g3
@eot
]]
local result = execute { src = shader }
local bytes = {1, 2, 3, 0x7F}
print("expected", DP4A(bytes, bytes, 50))
print("calculated", result[0])

View file

@ -0,0 +1,18 @@
-- Example from the help message: read dword 42, add the invocation index
-- to it and write the sums back starting at dword 0.
local shader = [[
@mov g1 42
@read g2 g1
@id g3
add(8) g4<1>UD g2<8,8,1>UD g3<8,8,1>UD { align1 @1 1Q };
@write g3 g4
@eot
]]
local result = execute { data = { [42] = 0x100 }, src = shader }
dump(result, 4)

View file

@ -0,0 +1,6 @@
-- Smallest possible shader: a single nop followed by the end-of-thread macro.
local shader = [[
nop;
@eot
]]
execute { src = shader }

View file

@ -0,0 +1,20 @@
-- Fill dwords 0..7 with 0, 4, 8, ... then have the shader add 0x100 to each.
local input = {}
for dword = 0, 7 do
   input[dword] = 4 * dword
end
local shader = [[
@id g1
@read g3 g1
add(8) g3<1>UD g3<8,8,1>UD 0x100UD { align1 1Q };
@write g1 g3
@eot
]]
local result = execute { data = input, src = shader }
dump(result, 8)

View file

@ -0,0 +1,94 @@
/*
* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#ifndef EXECUTOR_H
#define EXECUTOR_H
#include <stdint.h>
#include "intel/dev/intel_device_info.h"
#include "intel/isl/isl.h"
/* A buffer object that is CPU-mapped and placed at a chosen GPU address. */
typedef struct {
/* Size of the buffer in bytes. */
uint32_t size;
/* Kernel GEM handle of the buffer. */
uint32_t handle;
/* CPU mapping of the buffer. */
void *map;
/* Next free byte inside `map`; advanced by the executor_alloc_bytes*() helpers. */
void *cursor;
/* GPU address assigned to the buffer. */
uint64_t addr;
} executor_bo;
/* State shared by one execution: device, kernel-driver objects and the BOs. */
typedef struct {
/* ralloc context used for allocations made on behalf of this context. */
void *mem_ctx;
struct intel_device_info *devinfo;
struct isl_device *isl_dev;
/* DRM file descriptor used for the ioctls. */
int fd;
/* Objects used when running on the i915 kernel driver. */
struct {
uint32_t ctx_id;
} i915;
/* Objects used when running on the Xe kernel driver. */
struct {
uint32_t vm_id;
uint32_t queue_id;
} xe;
struct {
/* Command stream. */
executor_bo batch;
/* Kernel binary and, on older generations, the interface descriptor. */
executor_bo extra;
/* Data buffer addressed by the shader's @read/@write macros. */
executor_bo data;
} bo;
/* GPU address at which the emitted batch starts. */
uint64_t batch_start;
} executor_context;
/* Inputs of a single execute() request. */
typedef struct {
/* Shader source as given by the script, before macro expansion. */
const char *original_src;
/* Assembled kernel and its size in bytes; copied into the extra BO. */
void *kernel_bin;
uint32_t kernel_size;
} executor_params;
/* Address type handed to the genxml pack functions (see __gen_address_type). */
typedef struct {
/* Address value; combined with a delta by executor_combine_address(). */
uint64_t offset;
} executor_address;
/*
 * Address-combining hook for the genxml packers: the final value is simply
 * the address offset plus the delta. `data` and `location` are not used.
 */
__attribute__((unused)) static uint64_t
executor_combine_address(void *data, void *location,
                         executor_address address, uint32_t delta)
{
   const uint64_t base = address.offset;
   return base + delta;
}
/* Translate a CPU pointer inside the BO's mapping into its GPU address. */
executor_address executor_address_of_ptr(executor_bo *bo, void *ptr);
/* Reserve `size` bytes at the BO's cursor and return a pointer to them. */
void *executor_alloc_bytes(executor_bo *bo, uint32_t size);
/* Same as executor_alloc_bytes() but first rounds the cursor up to `alignment`. */
void *executor_alloc_bytes_aligned(executor_bo *bo, uint32_t size, uint32_t alignment);
/* Report a printf-style formatted failure. */
void failf(const char *fmt, ...) PRINTFLIKE(1, 2);
/* Expand the @-macros in `original_src` and return the resulting assembly. */
const char *executor_apply_macros(executor_context *ec, const char *original_src);
#ifdef genX
# include "executor_genx.h"
#else
# define genX(x) gfx9_##x
# include "executor_genx.h"
# undef genX
# define genX(x) gfx11_##x
# include "executor_genx.h"
# undef genX
# define genX(x) gfx12_##x
# include "executor_genx.h"
# undef genX
# define genX(x) gfx125_##x
# include "executor_genx.h"
# undef genX
# define genX(x) gfx20_##x
# include "executor_genx.h"
# undef genX
#endif
#endif /* EXECUTOR_H */

View file

@ -0,0 +1,183 @@
/*
* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "executor.h"
#ifdef HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
#else
#define VG(x) ((void)0)
#endif
#define __gen_address_type executor_address
#define __gen_combine_address executor_combine_address
#define __gen_user_data void
#include "intel/genxml/gen_macros.h"
#include "intel/genxml/genX_pack.h"
/* Token-pasting helpers that build the genxml names for a command. */
#define __executor_cmd_length(cmd) cmd ## _length
#define __executor_cmd_header(cmd) cmd ## _header
#define __executor_cmd_pack(cmd) cmd ## _pack
/*
 * Emit one command into the batch BO.
 *
 * Used as `executor_batch_emit(GENX(CMD), name) { name.Field = ...; }`.
 * The for-loop runs its body exactly once: `name` starts out as the command
 * header, space for the packed command is reserved in ec->bo.batch, and after
 * the body the struct is packed into that space and `_dst` is cleared so the
 * loop terminates. Expects a variable named `ec` to be in scope.
 */
#define executor_batch_emit(cmd, name) \
for (struct cmd name = { __executor_cmd_header(cmd) }, \
*_dst = executor_alloc_bytes(&ec->bo.batch, __executor_cmd_length(cmd) * 4); \
__builtin_expect(_dst != NULL, 1); \
({ __executor_cmd_pack(cmd)(0, _dst, &name); \
VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __executor_cmd_length(cmd) * 4)); \
_dst = NULL; \
}))
/*
 * Emit a PIPE_CONTROL into the batch with the pipe-control flush and command
 * streamer stall bits set, plus the HDC pipeline flush on Gfx12 and later.
 */
static void
emit_pipe_control(executor_context *ec)
{
executor_batch_emit(GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 12
pc.HDCPipelineFlushEnable = true;
#endif
pc.PipeControlFlushEnable = true;
pc.CommandStreamerStallEnable = true;
}
}
/*
 * Emit STATE_BASE_ADDRESS with every base set to zero, so that offsets used
 * elsewhere are full addresses, and with `mocs` applied to every MOCS field.
 */
static void
emit_state_base_address(executor_context *ec, uint32_t mocs)
{
   /* Use the full address for everything. */
   const executor_address zero_base = { .offset = 0 };
   const uint32_t max_size = (1 << 20) - 1;

   executor_batch_emit(GENX(STATE_BASE_ADDRESS), sba) {
      /* General state. */
      sba.GeneralStateBaseAddress = zero_base;
      sba.GeneralStateBaseAddressModifyEnable = true;
      sba.GeneralStateBufferSize = max_size;
      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.GeneralStateMOCS = mocs;

      /* Dynamic state. */
      sba.DynamicStateBaseAddress = zero_base;
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBufferSize = max_size;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.DynamicStateMOCS = mocs;

      /* Instructions. */
      sba.InstructionBaseAddress = zero_base;
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBufferSize = max_size;
      sba.InstructionBuffersizeModifyEnable = true;
      sba.InstructionMOCS = mocs;

      /* Indirect objects. */
      sba.IndirectObjectBaseAddress = zero_base;
      sba.IndirectObjectBaseAddressModifyEnable = true;
      sba.IndirectObjectBufferSize = max_size;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.IndirectObjectMOCS = mocs;

      /* Remaining MOCS-only fields. */
      sba.SurfaceStateMOCS = mocs;
      sba.StatelessDataPortAccessMOCS = mocs;
#if GFX_VER >= 11
      sba.BindlessSamplerStateMOCS = mocs;
#endif
      sba.BindlessSurfaceStateMOCS = mocs;

#if GFX_VERx10 >= 125
      sba.L1CacheControl = L1CC_WB;
#endif
   }
}
/*
 * Build the batch that runs one compute kernel.
 *
 * The kernel binary from `params` is copied into the extra BO, then the
 * batch BO receives, in order: pipeline selection (before Xe2), base
 * addresses, the compute front-end state, the walker that dispatches a
 * single thread group, and MI_BATCH_BUFFER_END. The GPU address of the
 * first emitted command is stored in ec->batch_start.
 */
void
genX(emit_execute)(executor_context *ec, const executor_params *params)
{
   /* Place the kernel in the extra BO and remember its GPU address. */
   uint32_t *kernel = executor_alloc_bytes(&ec->bo.extra, params->kernel_size);
   memcpy(kernel, params->kernel_bin, params->kernel_size);
   executor_address kernel_addr = executor_address_of_ptr(&ec->bo.extra, kernel);

   /* TODO: Let SIMD be a parameter. */
   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
      .KernelStartPointer = kernel_addr.offset,
      .NumberofThreadsinGPGPUThreadGroup = 1,
   };

   /* A zero-sized allocation only moves the cursor to the next 256-byte
    * boundary; that position is where this batch begins.
    */
   void *b = executor_alloc_bytes_aligned(&ec->bo.batch, 0, 256);
   ec->batch_start = executor_address_of_ptr(&ec->bo.batch, b).offset;

   emit_pipe_control(ec);

#if GFX_VERx10 < 200
   executor_batch_emit(GENX(PIPELINE_SELECT), ps) {
      ps.PipelineSelection = GPGPU;
      ps.MaskBits = 0x3;
   }
   emit_pipe_control(ec);
#endif

   const uint32_t mocs = isl_mocs(ec->isl_dev, 0, false);
   emit_state_base_address(ec, mocs);

#if GFX_VERx10 >= 125
   executor_batch_emit(GENX(STATE_COMPUTE_MODE), cm) {
      cm.Mask1 = 0xffff;
#if GFX_VERx10 >= 200
      cm.Mask2 = 0xffff;
#endif
   }
   executor_batch_emit(GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads = 64;
   }
#else
   executor_batch_emit(GENX(MEDIA_VFE_STATE), vfe) {
      vfe.NumberofURBEntries = 2;
      vfe.MaximumNumberofThreads = 64;
   }
#endif

   emit_pipe_control(ec);

#if GFX_VERx10 >= 125
   /* The interface descriptor is embedded in the walker itself. */
   executor_batch_emit(GENX(COMPUTE_WALKER), cw) {
#if GFX_VERx10 >= 200
      cw.SIMDSize = 1;
      cw.MessageSIMD = 1;
#endif
      cw.ThreadGroupIDXDimension = 1;
      cw.ThreadGroupIDYDimension = 1;
      cw.ThreadGroupIDZDimension = 1;
      cw.ExecutionMask = 0xFFFFFFFF;
      cw.PostSync.MOCS = mocs;
      cw.InterfaceDescriptor = desc;
   }
#else
   /* The interface descriptor lives in the extra BO and is loaded by a
    * separate command before the walker.
    */
   uint32_t *idd = executor_alloc_bytes_aligned(&ec->bo.extra, 8 * 4, 256);
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, idd, &desc);
   executor_address idd_addr = executor_address_of_ptr(&ec->bo.extra, idd);

   executor_batch_emit(GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
      /* Two separate statements; the original joined them with a comma. */
      load.InterfaceDescriptorDataStartAddress = idd_addr.offset;
      load.InterfaceDescriptorTotalLength = 8 * 4;
   }
   executor_batch_emit(GENX(GPGPU_WALKER), gw) {
      gw.ThreadGroupIDXDimension = 1;
      gw.ThreadGroupIDYDimension = 1;
      gw.ThreadGroupIDZDimension = 1;
      gw.RightExecutionMask = 0xFFFFFFFF;
      gw.BottomExecutionMask = 0xFFFFFFFF;
   }
   executor_batch_emit(GENX(MEDIA_STATE_FLUSH), msf);
#endif

   emit_pipe_control(ec);
   executor_batch_emit(GENX(MI_BATCH_BUFFER_END), end);
}

View file

@ -0,0 +1,10 @@
/*
* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#ifndef EXECUTOR_H
#error This file must be included via executor.h
#endif
/* Per-generation entry point; genX() supplies the gfxN_ prefix of the name. */
void genX(emit_execute)(executor_context *ec, const executor_params *params);

View file

@ -0,0 +1,407 @@
/*
* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include <ctype.h>
#include "util/ralloc.h"
#include "intel/compiler/brw_asm.h"
#include "executor.h"
/* Return true when `s` begins with `prefix` (an empty prefix always matches). */
static bool
startswith(const char *prefix, const char *s)
{
   const size_t prefix_len = strlen(prefix);
   return strncmp(prefix, s, prefix_len) == 0;
}
/*
 * Return a pointer to the first character of `start` after `prefix`.
 * `start` must begin with `prefix` (checked by assertion).
 */
static char *
skip_prefix(char *prefix, char *start)
{
   assert(startswith(prefix, start));
   return start + strlen(prefix);
}
/* Whitespace-separated arguments of a macro line, as returned by parse_args(). */
typedef struct {
/* Array of `count` NUL-terminated argument strings. */
char **args;
/* Number of entries in `args`. */
int count;
} parse_args_result;
/*
 * Split `c` into whitespace-separated arguments.
 *
 * Each argument is copied into a new string; the array and the copies are
 * ralloc'ed on `mem_ctx`. Returns an empty result ({NULL, 0}) when `c`
 * contains only whitespace.
 */
static parse_args_result
parse_args(void *mem_ctx, char *c)
{
   parse_args_result r = {0};

   while (*c) {
      /* Skip spaces. The cast keeps isspace() defined for bytes >= 0x80,
       * which are negative when plain char is signed.
       */
      while (*c && isspace((unsigned char)*c))
         c++;

      if (!*c)
         break;

      /* Copy non-spaces. */
      char *start = c;
      while (*c && !isspace((unsigned char)*c))
         c++;

      r.args = reralloc_array_size(mem_ctx, r.args, sizeof(char *), r.count + 1);
      r.args[r.count++] = ralloc_strndup(mem_ctx, start, c - start);
   }

   return r;
}
/*
 * Expand "@mov REG IMM" into a MOV of the immediate into REG.
 *
 * An immediate containing a '.' is parsed as a float and emitted as its raw
 * 32-bit pattern with type F; anything else is lowercased and emitted as a
 * UD immediate. The execution width follows the Gfx version: 8 wide for
 * verx10 90..125 and 16 wide for verx10 200.
 */
static void
executor_macro_mov(executor_context *ec, char **src, char *line)
{
   char *c = skip_prefix("@mov", line);
   parse_args_result r = parse_args(ec->mem_ctx, c);

   if (r.count != 2)
      failf("@mov needs 2 arguments, found %d\n", r.count);

   const char *reg = r.args[0];
   char *value = r.args[1];

   /* Execution size and the matching quarter/half control. */
   const char *exec_size = NULL;
   const char *group = NULL;
   switch (ec->devinfo->verx10) {
   case 90:
   case 110:
   case 120:
   case 125:
      exec_size = "8";
      group = "1Q";
      break;
   case 200:
      exec_size = "16";
      group = "1H";
      break;
   default:
      unreachable("invalid gfx version");
   }

   if (strchr(value, '.')) {
      union {
         float f;
         uint32_t u;
      } val;

      val.f = strtof(value, NULL);
      ralloc_asprintf_append(src, "mov(%s) %s<1>F 0x%08xF /* %f */ { align1 %s };\n",
                             exec_size, reg, val.u, val.f, group);
   } else {
      /* The cast keeps tolower() defined for bytes >= 0x80. */
      for (char *p = value; *p; p++)
         *p = tolower((unsigned char)*p);

      ralloc_asprintf_append(src, "mov(%s) %s<1>UD %sUD { align1 %s };\n",
                             exec_size, reg, value, group);
   }
}
/*
 * Expand "@syncnop": append a sync nop instruction for verx10 120 and later;
 * nothing is appended for verx10 90 and 110. `line` is not used.
 */
static void
executor_macro_syncnop(executor_context *ec, char **src, char *line)
{
   const char *text = NULL;

   switch (ec->devinfo->verx10) {
   case 90:
   case 110:
      /* Not needed. */
      return;
   case 120:
      text = "sync nop(8) null<0,1,0>UD { align1 WE_all 1H @1 $1 };\n";
      break;
   case 125:
   case 200:
      text = "sync nop(8) null<0,1,0>UD { align1 WE_all 1H A@1 $1 };\n";
      break;
   default:
      unreachable("invalid gfx version");
   }

   ralloc_strcat(src, text);
}
/*
 * Expand "@eot": copy g0 into g127 and send it with the EOT flag, using the
 * message form that matches the Gfx version. `line` is not used.
 */
static void
executor_macro_eot(executor_context *ec, char **src, char *line)
{
   const char *text = NULL;

   switch (ec->devinfo->verx10) {
   case 90:
   case 110:
      text =
         "mov(8) g127<1>UD g0<8;8,1>UD { align1 WE_all 1Q };\n"
         "send(8) null<1>UW g127<0,1,0>UD 0x82000010\n"
         " thread_spawner MsgDesc: mlen 1 rlen 0 { align1 WE_all 1Q EOT };\n";
      break;
   case 120:
      text =
         "mov(8) g127<1>UD g0<8;8,1>UD { align1 WE_all 1Q };\n"
         "send(8) nullUD g127UD nullUD 0x02000000 0x00000000\n"
         " thread_spawner MsgDesc: mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q @1 EOT };\n";
      break;
   case 125:
      text =
         "mov(8) g127<1>UD g0<8;8,1>UD { align1 WE_all 1Q };\n"
         "send(8) nullUD g127UD nullUD 0x02000000 0x00000000\n"
         " gateway MsgDesc: (open) mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q A@1 EOT };\n";
      break;
   case 200:
      text =
         "mov(16) g127<1>UD g0<1,1,0>UD { align1 WE_all 1H };\n"
         "send(16) nullUD g127UD nullUD 0x02000000 0x00000000\n"
         " gateway MsgDesc: (open) mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1H I@1 EOT };\n";
      break;
   default:
      unreachable("invalid gfx version");
   }

   ralloc_strcat(src, text);
}
/*
 * Expand "@id REG": build the per-lane index in g127 (from the 0x76543210V
 * vector immediate, extended with +8 for the upper lanes on verx10 200) and
 * move it into REG as UD.
 */
static void
executor_macro_id(executor_context *ec, char **src, char *line)
{
   parse_args_result r = parse_args(ec->mem_ctx, skip_prefix("@id", line));
   if (r.count != 1)
      failf("@id needs 1 argument, found %d\n", r.count);

   const char *reg = r.args[0];
   const int verx10 = ec->devinfo->verx10;

   if (verx10 == 200) {
      ralloc_asprintf_append(src,
         "mov(8) g127<1>UW 0x76543210V { align1 WE_all 1Q };\n"
         "add(8) g127.8<1>UW g127<1,1,0>UW 8UW { align1 WE_all 1Q @1 };\n"
         "mov(16) %s<1>UD g127<8,8,1>UW { align1 WE_all 1Q @1 };\n", reg);
   } else if (verx10 == 90 || verx10 == 110 || verx10 == 120 || verx10 == 125) {
      ralloc_asprintf_append(src,
         "mov(8) g127<1>UW 0x76543210V { align1 WE_all 1Q };\n"
         "mov(8) %s<1>UD g127<8,8,1>UW { align1 WE_all 1Q @1 };\n", reg);
   } else {
      unreachable("invalid gfx version");
   }
}
/*
 * Expand "@write OFFSET_REG SRC_REG".
 *
 * The generated code multiplies the offset register by 4, adds the data
 * BO's GPU address (which must fit in 32 bits) into g127, sends a store of
 * SRC_REG to that address, and finishes with the @syncnop expansion.
 */
static void
executor_macro_write(executor_context *ec, char **src, char *line)
{
   parse_args_result r = parse_args(ec->mem_ctx, skip_prefix("@write", line));
   if (r.count != 2)
      failf("@write needs 2 arguments, found %d\n", r.count);

   const char *offset_reg = r.args[0];
   const char *data_reg = r.args[1];

   assert(ec->bo.data.addr <= 0xFFFFFFFF);
   uint32_t base_addr = ec->bo.data.addr;

   const int verx10 = ec->devinfo->verx10;

   if (verx10 == 90 || verx10 == 110 || verx10 == 120) {
      /* Before verx10 120 the instruction is spelled "sends". */
      const char *send_suffix = verx10 < 120 ? "s" : "";
      ralloc_asprintf_append(src,
         "mul(8) g127<1>UD %s<8;8,1>UD 0x4UW { align1 @1 1Q };\n"
         "add(8) g127<1>UD g127<8;8,1>UD 0x%08xUD { align1 @1 1Q };\n"
         "send%s(8) nullUD g127UD %sUD 0x2026efd 0x00000040\n"
         " dp data 1 MsgDesc: (DC untyped surface write, Surface = 253, "
         " SIMD8, Mask = 0xe) mlen 1 ex_mlen 1 rlen 0 "
         " { align1 1Q @1 $1 };\n",
         offset_reg, base_addr, send_suffix, data_reg);
   } else if (verx10 == 125) {
      ralloc_asprintf_append(src,
         "mul(8) g127<1>UD %s<1;1,0>UD 0x4UW { align1 @1 1Q };\n"
         "add(8) g127<1>UD g127<1;1,0>UD 0x%08xUD { align1 @1 1Q };\n"
         "send(8) nullUD g127UD %sUD 0x02000504 0x00000040\n"
         " ugm MsgDesc: ( store, a32, d32, x, L1STATE_L3MOCS dst_len = 0, "
         " src0_len = 1, src1_len = 1, flat ) base_offset 0 "
         " { align1 1Q A@1 $1 };\n",
         offset_reg, base_addr, data_reg);
   } else if (verx10 == 200) {
      ralloc_asprintf_append(src,
         "mul(16) g127<1>UD %s<1;1,0>UD 0x4UW { align1 @1 1Q };\n"
         "add(16) g127<1>UD g127<1;1,0>UD 0x%08xUD { align1 @1 1Q };\n"
         "send(16) nullUD g127UD %sUD 0x02000504 0x00000040\n"
         " ugm MsgDesc: ( store, a32, d32, x, L1STATE_L3MOCS dst_len = 0, "
         " src0_len = 1, src1_len = 1, flat ) base_offset 0 "
         " { align1 1Q A@1 $1 };\n",
         offset_reg, base_addr, data_reg);
   } else {
      unreachable("invalid gfx version");
   }

   executor_macro_syncnop(ec, src, "@syncnop");
}
/*
 * Expand "@read DST_REG OFFSET_REG".
 *
 * The generated code multiplies the offset register by 4, adds the data
 * BO's GPU address (which must fit in 32 bits) into g127, sends a load from
 * that address into DST_REG, and finishes with the @syncnop expansion.
 *
 * NOTE(review): verx10 200 shares the 8-wide sequence of verx10 125 here,
 * while @write emits 16-wide instructions for verx10 200 -- confirm that
 * this difference is intended.
 */
static void
executor_macro_read(executor_context *ec, char **src, char *line)
{
   parse_args_result r = parse_args(ec->mem_ctx, skip_prefix("@read", line));
   if (r.count != 2)
      failf("@read needs 2 arguments, found %d\n", r.count);

   /* Order follows underlying SEND, destination first. */
   const char *data_reg = r.args[0];
   const char *offset_reg = r.args[1];

   assert(ec->bo.data.addr <= 0xFFFFFFFF);
   uint32_t base_addr = ec->bo.data.addr;

   const int verx10 = ec->devinfo->verx10;

   if (verx10 == 90 || verx10 == 110 || verx10 == 120) {
      /* Before verx10 120 the instruction is spelled "sends". */
      const char *send_suffix = verx10 < 120 ? "s" : "";
      ralloc_asprintf_append(src,
         "mul(8) g127<1>UD %s<8;8,1>UD 0x4UW { align1 @1 1Q };\n"
         "add(8) g127<1>UD g127<8;8,1>UD 0x%08xUD { align1 @1 1Q };\n"
         "send%s(8) %sUD g127UD nullUD 0x2106efd 0x00000000\n"
         " dp data 1 MsgDesc: (DC untyped surface read, Surface = 253, "
         " SIMD8, Mask = 0xe) mlen 1 ex_mlen 0 rlen 1 "
         " { align1 1Q @1 $1 };\n",
         offset_reg, base_addr, send_suffix, data_reg);
   } else if (verx10 == 125 || verx10 == 200) {
      ralloc_asprintf_append(src,
         "mul(8) g127<1>UD %s<1;1,0>UD 0x4UW { align1 @1 1Q };\n"
         "add(8) g127<1>UD g127<1;1,0>UD 0x%08xUD { align1 @1 1Q };\n"
         "send(8) %sUD g127UD nullUD 0x02100500 0x00000000\n"
         " ugm MsgDesc: ( load, a32, d32, x, L1STATE_L3MOCS dst_len = 1, "
         " src0_len = 1, flat ) src1_len = 0 base_offset 0 "
         " { align1 1Q A@1 $1 };\n",
         offset_reg, base_addr, data_reg);
   } else {
      unreachable("invalid gfx version");
   }

   executor_macro_syncnop(ec, src, "@syncnop");
}
/*
 * Return a pointer to the '@' that starts a macro on `line`, or NULL when
 * the first non-whitespace character of the line is not '@'.
 */
static char *
find_macro_symbol(char *line)
{
   char *c = line;

   /* The cast keeps isspace() defined for bytes >= 0x80, which are negative
    * when plain char is signed.
    */
   while (isspace((unsigned char)*c))
      c++;

   return *c == '@' ? c : NULL;
}
/*
 * Return true when `line` starts with the macro `name` as a whole word,
 * i.e. the name is followed by whitespace or by the end of the string.
 * This keeps "@mov" from matching a line that starts with "@movx".
 */
static bool
match_macro_name(const char *name, const char *line)
{
   if (!startswith(name, line))
      return false;

   const char next = line[strlen(name)];
   /* The cast keeps isspace() defined for bytes >= 0x80. */
   return next == '\0' || isspace((unsigned char)next);
}
/*
 * Expand every @-macro line of `original_src`.
 *
 * The source is processed line by line: lines whose first non-whitespace
 * character is '@' are handed to the matching macro handler, which appends
 * its expansion to the output; all other lines are copied through followed
 * by a newline. An '@' line that matches no known macro is reported through
 * failf(). The returned string is ralloc'ed on ec->mem_ctx.
 */
const char *
executor_apply_macros(executor_context *ec, const char *original_src)
{
   /* TODO: Create a @send macro for common combinations of MsgDesc. */
   static const struct {
      const char *name;
      void (*func)(executor_context *ec, char **output, char *line);
   } macros[] = {
      { "@eot", executor_macro_eot },
      { "@mov", executor_macro_mov },
      { "@write", executor_macro_write },
      { "@read", executor_macro_read },
      { "@id", executor_macro_id },
      { "@syncnop", executor_macro_syncnop },
   };

   /* Writable copy of the input; lines are terminated in place below. */
   char *scratch = ralloc_strdup(ec->mem_ctx, original_src);

   /* Create a ralloc'ed empty string so can call append to it later. */
   char *src = ralloc_strdup(ec->mem_ctx, "");

   for (char *line = scratch, *next = NULL; line; line = next) {
      /* Cut the current line out of the buffer and find where the next
       * one starts (NULL once the end of the input is reached).
       */
      char *end = line;
      while (*end && *end != '\n')
         end++;
      next = *end ? end + 1 : NULL;
      *end = '\0';

      char *macro = find_macro_symbol(line);
      if (!macro) {
         ralloc_asprintf_append(&src, "%s\n", line);
         continue;
      }

      int handler = -1;
      for (int i = 0; i < ARRAY_SIZE(macros); i++) {
         if (match_macro_name(macros[i].name, macro)) {
            handler = i;
            break;
         }
      }

      if (handler >= 0)
         macros[handler].func(ec, &src, macro);
      else
         failf("unsupported macro line: %s", macro);
   }

   return src;
}

View file

@ -0,0 +1,850 @@
/*
* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <lua.h>
#include <lualib.h>
#include <lauxlib.h>
#include "util/ralloc.h"
#include <xf86drm.h>
#include "drm-uapi/i915_drm.h"
#include "drm-uapi/xe_drm.h"
#include "intel/compiler/brw_asm.h"
#include "intel/compiler/brw_isa_info.h"
#include "intel/common/intel_gem.h"
#include "intel/common/xe/intel_engine.h"
#include "intel/decoder/intel_decoder.h"
#include "intel/dev/intel_debug.h"
#include "executor.h"
/* Fixed GPU addresses and the common size of the three executor BOs. */
enum {
/* Predictable base addresses here make it easier to spot errors. */
EXECUTOR_BO_BATCH_ADDR = 0x10000000,
EXECUTOR_BO_EXTRA_ADDR = 0x20000000,
EXECUTOR_BO_DATA_ADDR = 0x30000000,
/* Apply to all BOs. */
EXECUTOR_BO_SIZE = 10 * 1024 * 1024,
};
/*
 * Print the usage text: purpose of the tool, the assembly macros, the Lua
 * environment, the INTEL_DEBUG options and a worked example. The data
 * buffer address and the Lua release are substituted into the text.
 */
static void
print_help(void)
{
   printf(
      "Executes shaders written for Intel GPUs\n"
      "usage: executor FILENAME\n"
      "\n"
      "The input is a Lua script that can perform data manipulation\n"
      "and dispatch execution of compute shaders, written in Xe assembly,\n"
      "the same format used by the brw_asm assembler or when dumping\n"
      "shaders in debug mode.\n"
      "\n"
      "The goal is to have a tool to experiment directly with certain\n"
      "assembly instructions and the shared units without having to\n"
      "instrument the drivers.\n"
      "\n"
      "EXECUTION CONTEXT\n"
      "\n"
      "By default compute shaders are used with SIMD8 for Gfx9-125 and SIMD16\n"
      "for Xe2. Only a single thread is dispatched. A data buffer is used to\n"
      "pipe data into the shader and out of it, it is bound to the graphics\n"
      "address 0x%08x.\n"
      "\n"
      "The Gfx versions have differences in their assembly and shared units, so\n"
      "other than very simple examples, scripts for this program will be either\n"
      "specific to a version or provide shader variants for multiple versions.\n"
      "\n"
      "ASSEMBLY MACROS\n"
      "\n"
      "In addition to regular instructions, the following macros will generate\n"
      "assembly code based on the Gfx version being executed. Unlike in regular\n"
      "instructions, REGs don't use regions and can't be immediates.\n"
      "\n"
      "- @eot\n"
      " Send an EOT message.\n"
      "\n"
      "- @mov REG IMM\n"
      " Like a regular MOV but accepts numbers in both decimal and\n"
      " floating-point.\n"
      "\n"
      "- @id REG\n"
      " Write a local invocation index into REG.\n"
      "\n"
      "- @read DST_REG OFFSET_REG\n"
      " Read 32-bit values from the memory buffer at OFFSET_REG into DST_REG.\n"
      "\n"
      "- @write OFFSET_REG SRC_REG\n"
      " Write 32-bit values from SRC_REG to the memory buffer at OFFSET_REG.\n"
      "\n"
      "- @syncnop\n"
      " Produce a coarse grained sync.nop (when applicable) to ensure data from\n"
      " macros above are read/written.\n"
      "\n"
      "LUA ENVIRONMENT\n"
      "\n"
      "In addition to the regular Lua standard library the following variables and\n"
      "functions are available.\n"
      "\n"
      "- execute({src=STR, data=ARRAY}) -> ARRAY\n"
      " Takes a table as argument. The 'src' in the table contains the shader to be\n"
      " executed. The 'data' argument will be used to fill the data buffer with 32-bit\n"
      " values. The function returns an ARRAY with the contents of the data buffer\n"
      " after the shader completes.\n"
      "\n"
      "- dump(ARRAY, COUNT)\n"
      " Pretty print the COUNT first elements of an array of 32-bit values.\n"
      "\n"
      "- check_ver(V, ...), check_verx10(V, ...)\n"
      " Exit if the Gfx version being executed isn't in the arguments list.\n"
      "\n"
      "- ver, verx10\n"
      " Variables containing the Gfx version being executed.\n"
      "\n"
      "This program was compiled with %s.\n"
      "\n"
      "ENVIRONMENT VARIABLES\n"
      "\n"
      "The following INTEL_DEBUG values (comma separated) are used:\n"
      "\n"
      " - bat Dumps the batch buffer.\n"
      " - color Uses colors for the above.\n"
      " - cs Dumps the assembly after macro processing.\n"
      "\n"
      "EXAMPLE\n"
      "\n"
      "The following script\n"
      "\n"
      " local r = execute {\n"
      " data={ [42] = 0x100 },\n"
      " src=[[\n"
      " @mov g1 42\n"
      " @read g2 g1\n"
      "\n"
      " @id g3\n"
      "\n"
      " add(8) g4<1>UD g2<8,8,1>UD g3<8,8,1>UD { align1 @1 1Q };\n"
      "\n"
      " @write g3 g4\n"
      /* The shader has to end its thread; the example used to omit this. */
      " @eot\n"
      " ]]\n"
      " }\n"
      "\n"
      " dump(r, 4)\n"
      "\n"
      "Will produce the following output\n"
      "\n"
      " [0x00000000] 0x00000100 0x00000101 0x00000102 0x00000103\n"
      "\n"
      "More examples can be found in the examples/ directory in the source code.\n"
      "\n", EXECUTOR_BO_DATA_ADDR, LUA_RELEASE);
}
/* Process-wide device state; E.devinfo.verx10 drives genX_call(). */
static struct {
struct intel_device_info devinfo;
struct isl_device isl_dev;
struct brw_isa_info isa;
/* DRM file descriptor. */
int fd;
} E;
/*
 * Call the generation-specific variant of `func` (gfx9_, gfx11_, gfx12_,
 * gfx125_ or gfx20_ prefixed) selected by E.devinfo.verx10, forwarding the
 * remaining arguments. Expands to a bare switch statement.
 */
#define genX_call(func, ...) \
switch (E.devinfo.verx10) { \
case 90: gfx9_ ##func(__VA_ARGS__); break; \
case 110: gfx11_ ##func(__VA_ARGS__); break; \
case 120: gfx12_ ##func(__VA_ARGS__); break; \
case 125: gfx125_##func(__VA_ARGS__); break; \
case 200: gfx20_ ##func(__VA_ARGS__); break; \
default: unreachable("Unsupported hardware generation"); \
}
/*
 * Create a buffer object of `size_in_bytes`, map it for CPU access and
 * record `addr` as its GPU address.
 *
 * The GEM object is created and its mmap offset queried through the i915 or
 * Xe ioctls depending on the kernel driver in use. On return `bo` is fully
 * initialised with its cursor at the start of the mapping. Any failure is
 * reported through failf().
 */
static void
executor_create_bo(executor_context *ec, executor_bo *bo, uint64_t addr, uint32_t size_in_bytes)
{
   uint32_t handle = 0;
   uint64_t mmap_offset = 0;

   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
      struct drm_i915_gem_create gem_create = {
         .size = size_in_bytes,
      };

      int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
      if (err)
         failf("i915_gem_create");

      struct drm_i915_gem_mmap_offset mm = {
         .handle = gem_create.handle,
         .flags = ec->devinfo->has_local_mem ? I915_MMAP_OFFSET_FIXED
                                             : I915_MMAP_OFFSET_WC,
      };

      err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mm);
      if (err)
         failf("i915_gem_mmap_offset");

      handle = gem_create.handle;
      mmap_offset = mm.offset;
   } else {
      assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);

      struct drm_xe_gem_create gem_create = {
         .size = size_in_bytes,
         .cpu_caching = DRM_XE_GEM_CPU_CACHING_WB,
         .placement = 1u << ec->devinfo->mem.sram.mem.instance,
      };

      int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_GEM_CREATE, &gem_create);
      if (err)
         failf("xe_gem_create");

      struct drm_xe_gem_mmap_offset mm = {
         .handle = gem_create.handle,
      };

      err = intel_ioctl(ec->fd, DRM_IOCTL_XE_GEM_MMAP_OFFSET, &mm);
      if (err)
         failf("xe_gem_mmap_offset");

      handle = gem_create.handle;
      mmap_offset = mm.offset;
   }

   /* mmap() reports failure with MAP_FAILED, never with NULL, so a NULL
    * check would let a failed mapping through.
    */
   void *map = mmap(NULL, size_in_bytes, PROT_READ | PROT_WRITE,
                    MAP_SHARED, ec->fd, mmap_offset);
   if (map == MAP_FAILED)
      failf("mmap");

   bo->handle = handle;
   bo->map = map;
   bo->size = size_in_bytes;
   bo->addr = addr;
   bo->cursor = bo->map;
}
/*
 * Unmap the BO, close its GEM handle and zero the structure. A failure of
 * either step is reported through failf().
 */
static void
executor_destroy_bo(executor_context *ec, executor_bo *bo)
{
   struct drm_gem_close close_args = {
      .handle = bo->handle,
   };

   if (munmap(bo->map, bo->size) != 0)
      failf("munmap");

   if (intel_ioctl(ec->fd, DRM_IOCTL_GEM_CLOSE, &close_args) != 0)
      failf("gem_close");

   memset(bo, 0, sizeof(*bo));
}
/*
 * Print the used part of a BO (from the start of the mapping up to the
 * cursor) as 32-bit words, eight per row, each row prefixed with the GPU
 * address of its first word. The used size must be a multiple of 4.
 */
static void
executor_print_bo(executor_bo *bo, const char *name)
{
   const size_t used_bytes = (const char *)bo->cursor - (const char *)bo->map;
   assert(used_bytes % 4 == 0);

   const uint32_t *dw = bo->map;
   const uint32_t len = used_bytes / 4;

   /* Explicit casts keep the format specifiers correct on every ABI:
    * uint64_t is not `unsigned long` everywhere.
    */
   printf("=== %s (0x%08llx, %zu bytes) ===\n", name,
          (unsigned long long)bo->addr, used_bytes);

   for (uint32_t i = 0; i < len; i++) {
      if ((i % 8) == 0) printf("[0x%08x] ", (i * 4) + (uint32_t)bo->addr);
      printf("0x%08x ", dw[i]);
      if ((i % 8) == 7) printf("\n");
   }
   printf("\n");
}
/* Reserve `size` bytes at the BO's cursor without any alignment adjustment. */
void *
executor_alloc_bytes(executor_bo *bo, uint32_t size)
{
   const uint32_t no_alignment = 0;
   return executor_alloc_bytes_aligned(bo, size, no_alignment);
}
/*
 * Reserve `size` bytes in the BO and return a pointer to them.
 *
 * When `alignment` is non-zero the cursor is first rounded up with a
 * mask of `alignment - 1` (which presumes a power of two). The cursor is
 * left just past the reserved bytes. No check against the BO size is made.
 */
void *
executor_alloc_bytes_aligned(executor_bo *bo, uint32_t size, uint32_t alignment)
{
   uintptr_t start = (uintptr_t)bo->cursor;

   if (alignment != 0) {
      const uintptr_t mask = (uintptr_t)alignment - 1;
      start = (start + mask) & ~mask;
   }

   bo->cursor = (void *)(start + size);
   return (void *)start;
}
/*
 * Convert a CPU pointer inside the BO's mapping into the matching GPU
 * address: the byte distance from the start of the mapping added to the
 * BO's GPU address.
 */
executor_address
executor_address_of_ptr(executor_bo *bo, void *ptr)
{
   const uint64_t delta = (uint64_t)((char *)ptr - (char *)bo->map);
   executor_address result = { .offset = delta + bo->addr };
   return result;
}
/*
 * Open the first usable Intel GPU.
 *
 * Looks at up to 8 DRM devices and picks the first PCI device with vendor
 * id 0x8086 that exposes a render node, can be opened, and for which device
 * info can be queried with ver >= 8. On success `devinfo` is filled in and
 * the open render-node fd is returned; otherwise -1 is returned.
 */
static int
get_drm_device(struct intel_device_info *devinfo)
{
   drmDevicePtr devices[8];
   int max_devices = drmGetDevices2(0, devices, 8);
   int fd = -1;

   for (int i = 0; i < max_devices; i++) {
      const bool is_intel_render =
         (devices[i]->available_nodes & (1 << DRM_NODE_RENDER)) &&
         devices[i]->bustype == DRM_BUS_PCI &&
         devices[i]->deviceinfo.pci->vendor_id == 0x8086;
      if (!is_intel_render)
         continue;

      fd = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR | O_CLOEXEC);
      if (fd < 0)
         continue;

      if (!intel_get_device_info_from_fd(fd, devinfo, -1, -1) ||
          devinfo->ver < 8) {
         close(fd);
         fd = -1;
         continue;
      }

      /* Found a device! */
      break;
   }

   /* The device list is allocated by libdrm and has to be released; a
    * negative count means enumeration failed and nothing was returned.
    */
   if (max_devices > 0)
      drmFreeDevices(devices, max_devices);

   return fd;
}
static struct intel_batch_decode_bo
decode_get_bo(void *_ec, bool ppgtt, uint64_t address)
{
executor_context *ec = _ec;
struct intel_batch_decode_bo bo = {0};
if (address >= ec->bo.batch.addr && address < ec->bo.batch.addr + ec->bo.batch.size) {
bo.addr = ec->bo.batch.addr;
bo.size = ec->bo.batch.size;
bo.map = ec->bo.batch.map;
} else if (address >= ec->bo.extra.addr && address < ec->bo.extra.addr + ec->bo.extra.size) {
bo.addr = ec->bo.extra.addr;
bo.size = ec->bo.extra.size;
bo.map = ec->bo.extra.map;
} else if (address >= ec->bo.data.addr && address < ec->bo.data.addr + ec->bo.data.size) {
bo.addr = ec->bo.data.addr;
bo.size = ec->bo.data.size;
bo.map = ec->bo.data.map;
}
return bo;
}
/*
 * Batch-decoder callback: every state is reported as EXECUTOR_BO_SIZE bytes
 * regardless of the arguments.
 */
static unsigned
decode_get_state_size(void *_ec, uint64_t address, uint64_t base_address)
{
   const unsigned state_size = EXECUTOR_BO_SIZE;
   return state_size;
}
/*
 * Copy the 'data' table of an execute() call into the data BO.
 *
 * Every key must be an integer dword index inside the data buffer; the
 * value is converted with lua_tointeger() and stored as a 32-bit word at
 * that index. Invalid input is reported through failf(), which is expected
 * not to return.
 *
 * The range check is done at run time rather than with assert() because
 * the keys come straight from the user's script: an out-of-range or
 * negative key would otherwise write outside the mapping in builds without
 * assertions. It also rejects key == size / 4, which is one past the end.
 */
static void
parse_execute_data(executor_context *ec, lua_State *L, int table_idx)
{
   if (!lua_istable(L, table_idx))
      failf("'data' for execute call must be a table");

   uint32_t *data = ec->bo.data.map;
   const lua_Integer num_dwords = ec->bo.data.size / 4;

   /* Iterate over a copy of the table reference pushed on top of the stack. */
   lua_pushvalue(L, table_idx);
   lua_pushnil(L);
   while (lua_next(L, -2) != 0) {
      int val_idx = lua_gettop(L);
      int key_idx = val_idx - 1;

      if (lua_type(L, key_idx) != LUA_TNUMBER || !lua_isinteger(L, key_idx))
         failf("invalid key for data in execute call");

      lua_Integer key = lua_tointeger(L, key_idx);
      if (key < 0 || key >= num_dwords)
         failf("key %lld for data in execute call is outside the data buffer",
               (long long)key);

      lua_Integer val = lua_tointeger(L, val_idx);
      data[key] = val;

      /* Drop the value, keep the key for the next lua_next(). */
      lua_pop(L, 1);
   }
   lua_pop(L, 1);
}
/*
 * Read the options table passed to execute(), which must be on top of the
 * Lua stack.
 *
 * 'src' is duplicated into params->original_src and 'data' is forwarded to
 * parse_execute_data(). Entries with non-string keys are skipped; any other
 * string key is reported through failf().
 */
static void
parse_execute_args(executor_context *ec, lua_State *L, executor_params *params)
{
   const int opts = lua_gettop(L);

   /* Each iteration ends by popping the value, leaving the key for lua_next(). */
   for (lua_pushnil(L); lua_next(L, opts) != 0; lua_pop(L, 1)) {
      const int val_idx = lua_gettop(L);
      const int key_idx = val_idx - 1;

      if (lua_type(L, key_idx) != LUA_TSTRING)
         continue;

      const char *key = lua_tostring(L, key_idx);

      if (strcmp(key, "src") == 0) {
         params->original_src = ralloc_strdup(ec->mem_ctx, luaL_checkstring(L, val_idx));
      } else if (strcmp(key, "data") == 0) {
         parse_execute_data(ec, L, val_idx);
      } else {
         failf("unknown parameter '%s' for execute()", key);
      }
   }
}
/*
 * Create the kernel-driver objects and the three BOs used for an execution.
 *
 * On i915 a GEM context is created. On Xe a VM is created and an exec queue
 * is set up on the first render engine reported by the kernel. Afterwards
 * the batch, extra and data BOs are created at their fixed addresses and
 * the data BO is filled with the 0xABABABAB pattern. Failures are reported
 * through failf().
 */
static void
executor_context_setup(executor_context *ec)
{
if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
struct drm_i915_gem_context_create create = {0};
int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
if (err)
failf("i915_gem_context_create");
ec->i915.ctx_id = create.ctx_id;
} else {
assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);
struct drm_xe_vm_create create = {
.flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
};
int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_CREATE, &create);
if (err)
failf("xe_vm_create");
ec->xe.vm_id = create.vm_id;
/* Pick the first render-class engine to place the exec queue on. */
struct drm_xe_engine_class_instance instance = {0};
struct intel_query_engine_info *engines_info = xe_engine_get_info(ec->fd);
assert(engines_info);
bool found_engine = false;
for (int i = 0; i < engines_info->num_engines; i++) {
struct intel_engine_class_instance *e = &engines_info->engines[i];
if (e->engine_class == INTEL_ENGINE_CLASS_RENDER) {
instance.engine_class = DRM_XE_ENGINE_CLASS_RENDER;
instance.engine_instance = e->engine_instance;
instance.gt_id = e->gt_id;
found_engine = true;
break;
}
}
assert(found_engine);
/* NOTE(review): engines_info is not released here -- confirm whether the
 * caller of xe_engine_get_info() owns the returned memory.
 */
struct drm_xe_exec_queue_create queue_create = {
.vm_id = ec->xe.vm_id,
.width = 1,
.num_placements = 1,
.instances = (uintptr_t)&instance,
};
err = intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &queue_create);
if (err)
failf("xe_exec_queue_create");
ec->xe.queue_id = queue_create.exec_queue_id;
}
/* All three BOs have the same size and live at fixed GPU addresses. */
executor_create_bo(ec, &ec->bo.batch, EXECUTOR_BO_BATCH_ADDR, EXECUTOR_BO_SIZE);
executor_create_bo(ec, &ec->bo.extra, EXECUTOR_BO_EXTRA_ADDR, EXECUTOR_BO_SIZE);
executor_create_bo(ec, &ec->bo.data, EXECUTOR_BO_DATA_ADDR, EXECUTOR_BO_SIZE);
/* Fill the data buffer with a recognizable pattern. */
uint32_t *data = ec->bo.data.map;
for (int i = 0; i < EXECUTOR_BO_SIZE / 4; i++)
data[i] = 0xABABABAB;
}
/* Submit the batch buffer to the kernel driver and wait for it to complete.
 *
 * On i915 the batch, extra and data BOs are passed to execbuffer2 at their
 * fixed addresses and the batch BO is then waited on.  On Xe the three BOs
 * are first bound into the VM, the exec is ordered after the bind through a
 * syncobj, and a second syncobj is waited on for completion.
 *
 * Any ioctl failure terminates the process through failf().
 */
static void
executor_context_dispatch(executor_context *ec)
{
   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
      struct drm_i915_gem_exec_object2 objs[] = {
         {
            .handle = ec->bo.batch.handle,
            .offset = ec->bo.batch.addr,
            .flags  = EXEC_OBJECT_PINNED,
         },
         {
            .handle = ec->bo.extra.handle,
            .offset = ec->bo.extra.addr,
            .flags  = EXEC_OBJECT_PINNED,
         },
         {
            .handle = ec->bo.data.handle,
            .offset = ec->bo.data.addr,
            .flags  = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE,
         },
      };

      struct drm_i915_gem_execbuffer2 exec = {0};
      exec.buffers_ptr = (uintptr_t)objs;
      exec.buffer_count = ARRAY_SIZE(objs);
      /* The commands may start part-way into the batch BO. */
      exec.batch_start_offset = ec->batch_start - ec->bo.batch.addr;
      exec.flags = I915_EXEC_BATCH_FIRST;
      exec.rsvd1 = ec->i915.ctx_id;

      int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &exec);
      if (err)
         failf("i915_gem_execbuffer2");

      struct drm_i915_gem_wait wait = {0};
      wait.bo_handle = ec->bo.batch.handle;
      wait.timeout_ns = INT64_MAX;

      err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
      if (err)
         failf("i915_gem_wait");
   } else {
      assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);

      /* First syncobj is signalled by the binding operation and waited by the
       * execution of the batch buffer.
       *
       * Second syncobj is signalled by the execution of batch buffer and
       * waited at the end.
       */
      uint32_t sync_handles[2] = {0};
      for (int i = 0; i < ARRAY_SIZE(sync_handles); i++) {
         struct drm_syncobj_create sync_create = {0};
         int err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_CREATE, &sync_create);
         if (err)
            failf("syncobj_create");
         sync_handles[i] = sync_create.handle;
      }

      struct drm_xe_vm_bind_op bind_ops[] = {
         {
            .op        = DRM_XE_VM_BIND_OP_MAP,
            .obj       = ec->bo.batch.handle,
            .addr      = ec->bo.batch.addr,
            .range     = EXECUTOR_BO_SIZE,
            .pat_index = ec->devinfo->pat.cached_coherent.index,
         },
         {
            .op        = DRM_XE_VM_BIND_OP_MAP,
            .obj       = ec->bo.extra.handle,
            .addr      = ec->bo.extra.addr,
            .range     = EXECUTOR_BO_SIZE,
            .pat_index = ec->devinfo->pat.cached_coherent.index,
         },
         {
            .op        = DRM_XE_VM_BIND_OP_MAP,
            .obj       = ec->bo.data.handle,
            .addr      = ec->bo.data.addr,
            .range     = EXECUTOR_BO_SIZE,
            .pat_index = ec->devinfo->pat.cached_coherent.index,
         },
      };

      struct drm_xe_sync bind_syncs[] = {
         {
            .type   = DRM_XE_SYNC_TYPE_SYNCOBJ,
            .handle = sync_handles[0],
            .flags  = DRM_XE_SYNC_FLAG_SIGNAL,
         },
      };

      struct drm_xe_vm_bind bind = {
         .vm_id           = ec->xe.vm_id,
         .num_binds       = ARRAY_SIZE(bind_ops),
         .vector_of_binds = (uintptr_t)bind_ops,
         .num_syncs       = ARRAY_SIZE(bind_syncs),
         .syncs           = (uintptr_t)bind_syncs,
      };

      int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_BIND, &bind);
      if (err)
         failf("xe_vm_bind");

      struct drm_xe_sync exec_syncs[] = {
         {
            /* No SIGNAL flag: the exec waits on the bind's syncobj. */
            .type   = DRM_XE_SYNC_TYPE_SYNCOBJ,
            .handle = sync_handles[0],
         },
         {
            .type   = DRM_XE_SYNC_TYPE_SYNCOBJ,
            .handle = sync_handles[1],
            .flags  = DRM_XE_SYNC_FLAG_SIGNAL,
         }
      };

      struct drm_xe_exec exec = {
         .exec_queue_id    = ec->xe.queue_id,
         .num_batch_buffer = 1,
         .address          = ec->batch_start,
         .num_syncs        = ARRAY_SIZE(exec_syncs),
         .syncs            = (uintptr_t)exec_syncs,
      };

      err = intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC, &exec);
      if (err)
         failf("xe_exec");

      struct drm_syncobj_wait wait = {
         .count_handles = 1,
         .handles       = (uintptr_t)&sync_handles[1],
         .timeout_nsec  = INT64_MAX,
      };

      err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait);
      if (err)
         failf("syncobj_wait");

      /* The syncobjs are only used by this submission.  Destroy them so
       * that repeated dispatches do not accumulate handles on the DRM fd.
       */
      for (int i = 0; i < ARRAY_SIZE(sync_handles); i++) {
         struct drm_syncobj_destroy sync_destroy = {
            .handle = sync_handles[i],
         };
         err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &sync_destroy);
         if (err)
            failf("syncobj_destroy");
      }
   }
}
/* Destroy the three BOs held by the context, then the kernel objects that
 * belong to the active KMD: the GEM context on i915, or the exec queue
 * followed by the VM on Xe.  A failing ioctl ends the process via failf().
 */
static void
executor_context_teardown(executor_context *ec)
{
   executor_destroy_bo(ec, &ec->bo.batch);
   executor_destroy_bo(ec, &ec->bo.extra);
   executor_destroy_bo(ec, &ec->bo.data);

   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
      struct drm_i915_gem_context_destroy ctx_destroy = {0};
      ctx_destroy.ctx_id = ec->i915.ctx_id;

      if (intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &ctx_destroy))
         failf("i915_gem_context_destroy");
      return;
   }

   assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);

   /* The queue is destroyed before the VM. */
   struct drm_xe_exec_queue_destroy queue_destroy = {0};
   queue_destroy.exec_queue_id = ec->xe.queue_id;
   if (intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC_QUEUE_DESTROY, &queue_destroy))
      failf("xe_exec_queue_destroy");

   struct drm_xe_vm_destroy vm_destroy = {0};
   vm_destroy.vm_id = ec->xe.vm_id;
   if (intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_DESTROY, &vm_destroy))
      failf("xe_vm_destroy");
}
/* Lua binding for execute().
 *
 * Expects exactly one argument (a table, parsed by parse_execute_args()).
 * The assembly source has its macros expanded and is assembled, the batch is
 * emitted for the current hardware generation and dispatched, and the
 * contents of the data BO are returned to the script.
 *
 * Returns one Lua value: a table with one integer per dword of the data BO,
 * keyed from 0.
 */
static int
l_execute(lua_State *L)
{
   executor_context ec = {
      .mem_ctx = ralloc_context(NULL),
      .devinfo = &E.devinfo,
      .isl_dev = &E.isl_dev,
      .fd = E.fd,
   };

   executor_context_setup(&ec);

   executor_params params = {0};

   {
      if (lua_gettop(L) != 1)
         failf("execute() must have a single table argument");

      parse_execute_args(&ec, L, &params);

      const char *src = executor_apply_macros(&ec, params.original_src);

      /* The assembler reads from a FILE, so wrap the in-memory source. */
      FILE *f = fmemopen((void *)src, strlen(src), "r");
      if (!f)
         failf("failed to open assembly source as a stream");

      brw_assemble_result asm = brw_assemble(ec.mem_ctx, ec.devinfo, f, "", 0);
      fclose(f);

      /* Show the macro-expanded source on request, and always when the
       * assembler rejected it.
       */
      if (INTEL_DEBUG(DEBUG_CS) || !asm.bin) {
         printf("=== Processed assembly source ===\n"
                "%s"
                "=================================\n\n", src);
      }

      if (!asm.bin)
         failf("assembler failure");

      params.kernel_bin = asm.bin;
      params.kernel_size = asm.bin_size;
   }

   genX_call(emit_execute, &ec, &params);

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      struct intel_batch_decode_ctx decoder;
      enum intel_batch_decode_flags flags = INTEL_BATCH_DECODE_DEFAULT_FLAGS;
      if (INTEL_DEBUG(DEBUG_COLOR))
         flags |= INTEL_BATCH_DECODE_IN_COLOR;

      intel_batch_decode_ctx_init_brw(&decoder, &E.isa, &E.devinfo, stdout,
                                      flags, NULL, decode_get_bo, decode_get_state_size, &ec);

      assert(ec.bo.batch.cursor > ec.bo.batch.map);
      const int batch_offset = ec.batch_start - ec.bo.batch.addr;
      const int batch_size = (ec.bo.batch.cursor - ec.bo.batch.map) - batch_offset;
      assert(batch_offset < batch_size);

      intel_print_batch(&decoder, ec.bo.batch.map, batch_size, ec.batch_start, false);

      intel_batch_decode_ctx_finish(&decoder);
   }

   executor_context_dispatch(&ec);

   {
      /* TODO: Use userdata to return a wrapped C array instead of building
       * values.  Could make integration with array operations better.
       */
      uint32_t *data = ec.bo.data.map;
      const int n = ec.bo.data.size / 4;
      lua_createtable(L, n, 0);
      /* Copy every dword of the data BO, not just a fixed prefix, so the
       * script can read any location its shader wrote.
       */
      for (int i = 0; i < n; i++) {
         lua_pushinteger(L, data[i]);
         lua_seti(L, -2, i);
      }
   }

   executor_context_teardown(&ec);
   ralloc_free(ec.mem_ctx);

   return 1;
}
/* Lua binding for dump(table, count).
 *
 * Prints `count` entries of `table`, starting at key 0, as 32-bit hex
 * values, eight per line.  Each line is prefixed with the byte offset of its
 * first entry.  Returns no Lua values.
 */
static int
l_dump(lua_State *L)
{
   /* TODO: Use a table to add options for the dump, e.g.
    * starting offset, format, etc.
    */
   assert(lua_type(L, 1) == LUA_TTABLE);
   assert(lua_type(L, 2) == LUA_TNUMBER);
   assert(lua_isinteger(L, 2));

   const lua_Integer requested = lua_tointeger(L, 2);
   assert(requested >= 0 && requested <= INT_MAX);
   const int count = requested;

   int idx = 0;
   while (idx < count) {
      const int column = idx % 8;

      /* Start of a row: print the byte offset (4 bytes per entry). */
      if (column == 0)
         printf("[0x%08x]", idx * 4);

      lua_rawgeti(L, 1, idx);
      const uint32_t dword = (uint32_t)lua_tointeger(L, -1);
      lua_pop(L, 1);

      printf(" 0x%08x", dword);

      if (column == 7)
         printf("\n");

      idx++;
   }

   /* Terminate a partially filled last row. */
   if (idx % 8 != 0)
      printf("\n");

   return 0;
}
/* Lua binding for check_ver(...).
 *
 * Accepts any number of integer arguments and returns (with no Lua values)
 * as soon as one of them equals the device's `ver`.  When none matches, the
 * process is terminated through failf().
 */
static int
l_check_ver(lua_State *L)
{
   const int nargs = lua_gettop(L);
   for (int i = 1; i <= nargs; i++) {
      /* luaL_checkinteger raises a Lua error for a value without an exact
       * integer representation, rather than letting a float be silently
       * truncated into a matching version.
       */
      const lua_Integer v = luaL_checkinteger(L, i);
      if (E.devinfo.ver == v)
         return 0;
   }

   failf("script doesn't support version=%d verx10=%d\n",
         E.devinfo.ver, E.devinfo.verx10);
   return 0;
}
/* Lua binding for check_verx10(...).
 *
 * Accepts any number of integer arguments and returns (with no Lua values)
 * as soon as one of them equals the device's `verx10`.  When none matches,
 * the process is terminated through failf().
 */
static int
l_check_verx10(lua_State *L)
{
   const int nargs = lua_gettop(L);
   for (int i = 1; i <= nargs; i++) {
      /* luaL_checkinteger raises a Lua error for a value without an exact
       * integer representation, rather than letting a float be silently
       * truncated into a matching version.
       */
      const lua_Integer v = luaL_checkinteger(L, i);
      if (E.devinfo.verx10 == v)
         return 0;
   }

   failf("script doesn't support version=%d verx10=%d\n",
         E.devinfo.ver, E.devinfo.verx10);
   return 0;
}
/* TODO: Review numeric limits in the code, especially around Lua integer
 * conversion.
 */
/* Entry point: `executor FILENAME`.
 *
 * Prints the help text when called without arguments or with a help flag.
 * Otherwise opens the DRM device, initializes the ISL device and ISA info,
 * creates a Lua state exposing `ver`, `verx10`, `execute`, `dump`,
 * `check_ver` and `check_verx10` as globals, and runs the script.
 *
 * Returns 0 on success; failures exit through failf().
 */
int
main(int argc, char *argv[])
{
   if (argc < 2 ||
       !strcmp(argv[1], "--help") ||
       !strcmp(argv[1], "-help") ||
       !strcmp(argv[1], "-h") ||
       !strcmp(argv[1], "help")) {
      print_help();
      return 0;
   }

   if (argc > 2) {
      /* TODO: Expose extra arguments to the script as a variable. */
      failf("invalid extra arguments\nusage: executor FILENAME");
      return 1;
   }

   process_intel_debug_variable();

   E.fd = get_drm_device(&E.devinfo);
   isl_device_init(&E.isl_dev, &E.devinfo);
   brw_init_isa_info(&E.isa, &E.devinfo);
   assert(E.devinfo.kmd_type == INTEL_KMD_TYPE_I915 ||
          E.devinfo.kmd_type == INTEL_KMD_TYPE_XE);

   lua_State *L = luaL_newstate();
   /* luaL_newstate() returns NULL when the state cannot be allocated; fail
    * cleanly instead of dereferencing it below.
    */
   if (!L)
      failf("failed to create Lua state");

   /* TODO: Could be nice to export some kind of builder interface,
    * maybe even let the script construct a shader at the BRW IR
    * level and let the later passes kick in.
    */
   luaL_openlibs(L);

   lua_pushinteger(L, E.devinfo.ver);
   lua_setglobal(L, "ver");

   lua_pushinteger(L, E.devinfo.verx10);
   lua_setglobal(L, "verx10");

   lua_pushcfunction(L, l_execute);
   lua_setglobal(L, "execute");

   lua_pushcfunction(L, l_dump);
   lua_setglobal(L, "dump");

   lua_pushcfunction(L, l_check_ver);
   lua_setglobal(L, "check_ver");

   lua_pushcfunction(L, l_check_verx10);
   lua_setglobal(L, "check_verx10");

   const char *filename = argv[1];
   int err = luaL_loadfile(L, filename);
   if (err)
      failf("failed to load script: %s", lua_tostring(L, -1));

   /* Errors raised while the script runs are caught here and reported. */
   err = lua_pcall(L, 0, 0, 0);
   if (err)
      failf("failed to run script: %s", lua_tostring(L, -1));

   lua_close(L);
   close(E.fd);

   return 0;
}
/* Report a fatal error and terminate.
 *
 * Writes "ERROR: ", the printf-style formatted message and a newline to
 * stderr, then exits the process with status 1.  Does not return.
 */
void
failf(const char *fmt, ...)
{
   va_list ap;

   fputs("ERROR: ", stderr);

   va_start(ap, fmt);
   vfprintf(stderr, fmt, ap);
   va_end(ap);

   fputc('\n', stderr);

   exit(1);
}

View file

@ -0,0 +1,58 @@
# Copyright © 2024 Intel Corporation
# SPDX-License-Identifier: MIT
# Nothing in this directory is built when the Lua dependency is missing.
if not dep_lua.found()
  subdir_done()
endif

# Compiler arguments shared by the per-generation libraries and the tool.
executor_flags = [no_override_init_args, sse2_args]

executor_includes = [inc_include, inc_src, inc_intel]

# executor_genx.c is compiled once per hardware generation listed below;
# GFX_VERx10 selects the generation for each build of the file.
executor_hw_libs = []
foreach executor_verx10 : ['90', '110', '120', '125', '200']
  executor_hw_libs += static_library(
    'executor_hw_ver' + executor_verx10,
    ['executor_genx.c', gen_xml_pack],
    include_directories: executor_includes,
    c_args: executor_flags + ['-DGFX_VERx10=' + executor_verx10],
    gnu_symbol_visibility: 'hidden',
    dependencies: [dep_valgrind, idep_genxml],
  )
endforeach

# The tool itself, linked against every per-generation library.
executor = executable(
  'executor',
  ['executor_main.c', 'executor_macros.c'],
  dependencies: [
    dep_libdrm,
    dep_lua,
    dep_valgrind,
    idep_brw_asm,
    idep_genxml,
    idep_intel_decoder_brw,
    idep_intel_dev,
    idep_libintel_common,
  ],
  include_directories: executor_includes,
  link_with: executor_hw_libs,
  c_args: executor_flags,
  gnu_symbol_visibility: 'hidden',
  install: true,
)

View file

@ -23,6 +23,7 @@ if with_intel_hasvk or with_intel_vk or with_gallium_iris
endif
if with_intel_tools
subdir('tools')
subdir('executor')
endif
if get_option('vulkan-layers').contains('intel-nullhw')
subdir('nullhw-layer')