diff --git a/src/intel/executor/examples/bfi.lua b/src/intel/executor/examples/bfi.lua
new file mode 100644
index 00000000000..1142095ce14
--- /dev/null
+++ b/src/intel/executor/examples/bfi.lua
@@ -0,0 +1,51 @@
+-- Compare the GPU result of a bfi1/bfi2 instruction pair against a
+-- reference value computed on the CPU by BFI_simulation().
+--
+-- BFI seems available on Gfx9, need to fix the emission code for that.
+check_verx10(110, 120, 125, 200)
+
+-- Reference result: take the low (a & 0x1F) bits of c, shift them to bit
+-- position (b & 0x1F) and merge them into d.  Lua integers are 64-bit, so
+-- the result is truncated to 32 bits; otherwise width + offset > 32 would
+-- leak bits above bit 31 into the value printed by Hex().
+local function BFI_simulation(a, b, c, d)
+  local width = a & 0x1F
+  local offset = b & 0x1F
+  local mask = ((1 << width) - 1) << offset
+  return (((c << offset) & mask) | (d & ~mask)) & 0xFFFFFFFF
+end
+
+-- Run bfi1/bfi2 with a, b, c, d placed in the first four dwords of the data
+-- buffer and return the first dword of the buffer after execution.
+local function BFI(a, b, c, d)
+  local r = execute {
+    data = { [0] = a, b, c, d },
+    src = [[
+      @id g9
+      @mov g11 0
+      @mov g12 1
+      @mov g13 2
+      @mov g14 3
+
+      @read g1 g11
+      @read g2 g12
+      @read g3 g13
+      @read g4 g14
+
+      bfi1(8) g5<1>UD g1<8,8,1>UD g2<8,8,1>UD { align1 @1 1Q };
+      bfi2(8) g6<1>UD g5<8,8,1>UD g3<8,8,1>UD g4<8,8,1>UD { align1 @1 1Q };
+
+      @write g9 g6
+      @eot
+    ]],
+  }
+  return r[0]
+end
+
+-- Format v as a zero-padded hexadecimal literal.
+local function Hex(v) return string.format("0x%08x", v) end
+
+local a, b, c, d = 12, 12, 0xAAAAAAAA, 0xBBBBBBBB
+
+print("calculated", Hex(BFI(a, b, c, d)))
+print("expected", Hex(BFI_simulation(a, b, c, d)))
diff --git a/src/intel/executor/examples/dp4a.lua b/src/intel/executor/examples/dp4a.lua
new file mode 100644
index 00000000000..f1f86845b21
--- /dev/null
+++ b/src/intel/executor/examples/dp4a.lua
@@ -0,0 +1,41 @@
+--[[
+
+Execute the example from the Dot Product 4 Accumulate
+instruction as seen in the PRM.
+
+  mov (1) r1.0:d 0x0102037F:d
+          // (char4)(0x1,0x2,0x3,0x7F)
+  mov (1) r2.0:d 50:d
+  dp4a (1) r3.0:d r2:d r1:d r1:d
+          // r3.0 = 50 + (0x1*0x1 + 0x2*0x2 + 0x3*0x3 + 0x7F*0x7F)
+          // = 50 + (1 + 4 + 9 + 16129)
+          // = 16193
+
+--]]
+
+check_ver(12)
+
+function DP4A(a, b, c)
+  local r = c
+  for i = 1, 4 do
+    r = r + a[i] * b[i]
+  end
+  return r
+end
+
+local r = execute {
+  src = [[
+    @id g9
+
+    @mov g1 0x0102037F
+    @mov g2 50
+
+    dp4a(8) g3<1>UD g2<8,8,1>UD g1<8,8,1>UD g1<8,8,1>UD { align1 @1 1Q };
+
+    @write g9 g3
+    @eot
+  ]],
+}
+
+print("expected", DP4A({1,2,3,0x7F}, {1,2,3,0x7F}, 50))
+print("calculated", r[0])
diff --git a/src/intel/executor/examples/help_example.lua b/src/intel/executor/examples/help_example.lua
new file mode 100644
index 00000000000..59268b80ab9
--- /dev/null
+++ b/src/intel/executor/examples/help_example.lua
@@ -0,0 +1,18 @@
+-- Example from the help message.
+
+local r = execute {
+  data={ [42] = 0x100 },
+  src=[[
+    @mov g1 42
+    @read g2 g1
+
+    @id g3
+
+    add(8) g4<1>UD g2<8,8,1>UD g3<8,8,1>UD { align1 @1 1Q };
+
+    @write g3 g4
+    @eot
+  ]]
+}
+
+dump(r, 4)
diff --git a/src/intel/executor/examples/nop.lua b/src/intel/executor/examples/nop.lua
new file mode 100644
index 00000000000..16e76a3c765
--- /dev/null
+++ b/src/intel/executor/examples/nop.lua
@@ -0,0 +1,6 @@
+execute {
+  src = [[
+    nop;
+    @eot
+  ]],
+}
diff --git a/src/intel/executor/examples/test.lua b/src/intel/executor/examples/test.lua
new file mode 100644
index 00000000000..89424e36e86
--- /dev/null
+++ b/src/intel/executor/examples/test.lua
@@ -0,0 +1,20 @@
+local data = {}
+for i = 0, 8-1 do
+  data[i] = i * 4
+end
+
+local r = execute {
+  data = data,
+  src = [[
+    @id g1
+    @read g3 g1
+
+    add(8) g3<1>UD g3<8,8,1>UD 0x100UD { align1 1Q };
+
+    @write g1 g3
+
+    @eot
+  ]],
+}
+
+dump(r, 8)
diff --git a/src/intel/executor/executor.h b/src/intel/executor/executor.h
new file mode 100644
index 00000000000..a4d777494f9
--- /dev/null
+++ b/src/intel/executor/executor.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef EXECUTOR_H
+#define EXECUTOR_H
+
+#include <stdint.h> /* NOTE(review): operand was missing; stdint.h assumed for the uintN_t types below. */
+
+#include "intel/dev/intel_device_info.h"
+#include "intel/isl/isl.h"
+
+typedef struct {
+   uint32_t size;    /* Size of the buffer in bytes. */
+   uint32_t handle;  /* GEM handle returned by the kernel driver. */
+   void *map;        /* CPU mapping of the whole buffer. */
+   void *cursor;     /* Next free byte inside the mapping; advanced by the alloc helpers. */
+   uint64_t addr;    /* GPU address assigned to the buffer. */
+} executor_bo;
+
+typedef struct {
+   void *mem_ctx;    /* ralloc context used for allocations made on behalf of this context. */
+
+   struct intel_device_info *devinfo;
+   struct isl_device *isl_dev;
+   int fd;           /* DRM file descriptor used for ioctls and mmap. */
+
+   struct {
+      uint32_t ctx_id;
+   } i915;
+
+   struct {
+      uint32_t vm_id;
+      uint32_t queue_id;
+   } xe;
+
+   struct {
+      executor_bo batch;
+      executor_bo extra;
+      executor_bo data;
+   } bo;
+
+   uint64_t batch_start;  /* GPU address of the first command in the batch BO. */
+} executor_context;
+
+typedef struct {
+   const char *original_src;  /* Shader source as passed in, before macro expansion. */
+
+   void *kernel_bin;          /* Assembled kernel and its size in bytes. */
+   uint32_t kernel_size;
+} executor_params;
+
+typedef struct {
+   uint64_t offset;
+} executor_address;
+
+__attribute__((unused)) static uint64_t
+executor_combine_address(void *data, void *location,
+                         executor_address address, uint32_t delta)
+{
+   return address.offset + delta;  /* data and location are unused. */
+}
+
+executor_address executor_address_of_ptr(executor_bo *bo, void *ptr);
+
+void *executor_alloc_bytes(executor_bo *bo, uint32_t size);
+void *executor_alloc_bytes_aligned(executor_bo *bo, uint32_t size, uint32_t alignment);
+
+void failf(const char *fmt, ...)
PRINTFLIKE(1, 2); + +const char *executor_apply_macros(executor_context *ec, const char *original_src); + +#ifdef genX +# include "executor_genx.h" +#else +# define genX(x) gfx9_##x +# include "executor_genx.h" +# undef genX +# define genX(x) gfx11_##x +# include "executor_genx.h" +# undef genX +# define genX(x) gfx12_##x +# include "executor_genx.h" +# undef genX +# define genX(x) gfx125_##x +# include "executor_genx.h" +# undef genX +# define genX(x) gfx20_##x +# include "executor_genx.h" +# undef genX +#endif + +#endif /* EXECUTOR_H */ diff --git a/src/intel/executor/executor_genx.c b/src/intel/executor/executor_genx.c new file mode 100644 index 00000000000..a5548dc1d25 --- /dev/null +++ b/src/intel/executor/executor_genx.c @@ -0,0 +1,183 @@ +/* + * Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "executor.h" + +#ifdef HAVE_VALGRIND +#include +#include +#define VG(x) x +#else +#define VG(x) ((void)0) +#endif + +#define __gen_address_type executor_address +#define __gen_combine_address executor_combine_address +#define __gen_user_data void + +#include "intel/genxml/gen_macros.h" +#include "intel/genxml/genX_pack.h" + +#define __executor_cmd_length(cmd) cmd ## _length +#define __executor_cmd_header(cmd) cmd ## _header +#define __executor_cmd_pack(cmd) cmd ## _pack + +#define executor_batch_emit(cmd, name) \ + for (struct cmd name = { __executor_cmd_header(cmd) }, \ + *_dst = executor_alloc_bytes(&ec->bo.batch, __executor_cmd_length(cmd) * 4); \ + __builtin_expect(_dst != NULL, 1); \ + ({ __executor_cmd_pack(cmd)(0, _dst, &name); \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __executor_cmd_length(cmd) * 4)); \ + _dst = NULL; \ + })) + +static void +emit_pipe_control(executor_context *ec) +{ + executor_batch_emit(GENX(PIPE_CONTROL), pc) { +#if GFX_VER >= 12 + pc.HDCPipelineFlushEnable = true; +#endif + pc.PipeControlFlushEnable = true; + pc.CommandStreamerStallEnable = true; + } +} + +static void 
+emit_state_base_address(executor_context *ec, uint32_t mocs) +{ + /* Use the full address for everything. */ + const executor_address base_address = {0}; + const uint32_t size = (1 << 20) - 1; + + executor_batch_emit(GENX(STATE_BASE_ADDRESS), sba) { + sba.GeneralStateBaseAddress = base_address; + sba.GeneralStateBaseAddressModifyEnable = true; + sba.GeneralStateBufferSize = size; + sba.GeneralStateBufferSizeModifyEnable = true; + sba.GeneralStateMOCS = mocs; + + sba.DynamicStateBaseAddress = base_address; + sba.DynamicStateBaseAddressModifyEnable = true; + sba.DynamicStateBufferSize = size; + sba.DynamicStateBufferSizeModifyEnable = true; + sba.DynamicStateMOCS = mocs; + + sba.InstructionBaseAddress = base_address; + sba.InstructionBaseAddressModifyEnable = true; + sba.InstructionBufferSize = size; + sba.InstructionBuffersizeModifyEnable = true; + sba.InstructionMOCS = mocs; + + sba.IndirectObjectBaseAddress = base_address; + sba.IndirectObjectBaseAddressModifyEnable = true; + sba.IndirectObjectBufferSize = size; + sba.IndirectObjectBufferSizeModifyEnable = true; + sba.IndirectObjectMOCS = mocs; + + sba.SurfaceStateMOCS = mocs; + sba.StatelessDataPortAccessMOCS = mocs; + +#if GFX_VER >= 11 + sba.BindlessSamplerStateMOCS = mocs; +#endif + sba.BindlessSurfaceStateMOCS = mocs; + +#if GFX_VERx10 >= 125 + sba.L1CacheControl = L1CC_WB; +#endif + }; +} + +void +genX(emit_execute)(executor_context *ec, const executor_params *params) +{ + uint32_t *kernel = executor_alloc_bytes(&ec->bo.extra, params->kernel_size); + memcpy(kernel, params->kernel_bin, params->kernel_size); + executor_address kernel_addr = executor_address_of_ptr(&ec->bo.extra, kernel); + + /* TODO: Let SIMD be a parameter. 
*/ + + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .KernelStartPointer = kernel_addr.offset, + .NumberofThreadsinGPGPUThreadGroup = 1, + }; + + void *b = executor_alloc_bytes_aligned(&ec->bo.batch, 0, 256); + ec->batch_start = executor_address_of_ptr(&ec->bo.batch, b).offset; + + emit_pipe_control(ec); + +#if GFX_VERx10 < 200 + executor_batch_emit(GENX(PIPELINE_SELECT), ps) { + ps.PipelineSelection = GPGPU; + ps.MaskBits = 0x3; + } + emit_pipe_control(ec); +#endif + + const uint32_t mocs = isl_mocs(ec->isl_dev, 0, false); + + emit_state_base_address(ec, mocs); + +#if GFX_VERx10 >= 125 + executor_batch_emit(GENX(STATE_COMPUTE_MODE), cm) { + cm.Mask1 = 0xffff; +#if GFX_VERx10 >= 200 + cm.Mask2 = 0xffff; +#endif + } + + executor_batch_emit(GENX(CFE_STATE), cfe) { + cfe.MaximumNumberofThreads = 64; + } +#else + executor_batch_emit(GENX(MEDIA_VFE_STATE), vfe) { + vfe.NumberofURBEntries = 2; + vfe.MaximumNumberofThreads = 64; + } +#endif + + emit_pipe_control(ec); + +#if GFX_VERx10 >= 125 + executor_batch_emit(GENX(COMPUTE_WALKER), cw) { +#if GFX_VERx10 >= 200 + cw.SIMDSize = 1; + cw.MessageSIMD = 1; +#endif + cw.ThreadGroupIDXDimension = 1; + cw.ThreadGroupIDYDimension = 1; + cw.ThreadGroupIDZDimension = 1; + cw.ExecutionMask = 0xFFFFFFFF; + cw.PostSync.MOCS = mocs; + cw.InterfaceDescriptor = desc; + }; +#else + uint32_t *idd = executor_alloc_bytes_aligned(&ec->bo.extra, 8 * 4, 256); + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, idd, &desc); + + executor_address idd_addr = executor_address_of_ptr(&ec->bo.extra, idd); + + executor_batch_emit(GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) { + load.InterfaceDescriptorDataStartAddress = idd_addr.offset, + load.InterfaceDescriptorTotalLength = 8 * 4; + } + + executor_batch_emit(GENX(GPGPU_WALKER), gw) { + gw.ThreadGroupIDXDimension = 1; + gw.ThreadGroupIDYDimension = 1; + gw.ThreadGroupIDZDimension = 1; + gw.RightExecutionMask = 0xFFFFFFFF; + gw.BottomExecutionMask = 0xFFFFFFFF; + } + + 
executor_batch_emit(GENX(MEDIA_STATE_FLUSH), msf); +#endif + + emit_pipe_control(ec); + + executor_batch_emit(GENX(MI_BATCH_BUFFER_END), end); +} diff --git a/src/intel/executor/executor_genx.h b/src/intel/executor/executor_genx.h new file mode 100644 index 00000000000..1fa1121ced4 --- /dev/null +++ b/src/intel/executor/executor_genx.h @@ -0,0 +1,10 @@ +/* + * Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#ifndef EXECUTOR_H +#error This file must be included via executor.h +#endif + +void genX(emit_execute)(executor_context *ec, const executor_params *params); diff --git a/src/intel/executor/executor_macros.c b/src/intel/executor/executor_macros.c new file mode 100644 index 00000000000..8360426f338 --- /dev/null +++ b/src/intel/executor/executor_macros.c @@ -0,0 +1,407 @@ +/* + * Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include + +#include "util/ralloc.h" +#include "intel/compiler/brw_asm.h" + +#include "executor.h" + +static bool +startswith(const char *prefix, const char *s) +{ + return !strncmp(prefix, s, strlen(prefix)); +} + +static char * +skip_prefix(char *prefix, char *start) +{ + assert(startswith(prefix, start)); + char *c = start += strlen(prefix); + return c; +} + +typedef struct { + char **args; + int count; +} parse_args_result; + +static parse_args_result +parse_args(void *mem_ctx, char *c) +{ + parse_args_result r = {0}; + + while (*c) { + /* Skip spaces. */ + while (*c && isspace(*c)) + c++; + if (!*c) + break; + + /* Copy non-spaces. 
*/ + char *start = c; + while (*c && !isspace(*c)) + c++; + r.args = reralloc_array_size(mem_ctx, r.args, sizeof(char *), r.count + 1); + r.args[r.count++] = ralloc_strndup(mem_ctx, start, c - start); + } + + return r; +} + +static void +executor_macro_mov(executor_context *ec, char **src, char *line) +{ + char *c = skip_prefix("@mov", line); + parse_args_result r = parse_args(ec->mem_ctx, c); + + if (r.count != 2) + failf("@mov needs 2 arguments, found %d\n", r.count); + + const char *reg = r.args[0]; + char *value = r.args[1]; + + if (strchr(value, '.')) { + union { + float f; + uint32_t u; + } val; + + val.f = strtof(value, NULL); + + switch (ec->devinfo->verx10) { + case 90: + case 110: + case 120: + case 125: { + ralloc_asprintf_append(src, "mov(8) %s<1>F 0x%08xF /* %f */ { align1 1Q };\n", reg, val.u, val.f); + break; + } + case 200: { + ralloc_asprintf_append(src, "mov(16) %s<1>F 0x%08xF /* %f */ { align1 1H };\n", reg, val.u, val.f); + break; + } + default: + unreachable("invalid gfx version"); + } + + } else { + for (char *c = value; *c; c++) + *c = tolower(*c); + switch (ec->devinfo->verx10) { + case 90: + case 110: + case 120: + case 125: { + ralloc_asprintf_append(src, "mov(8) %s<1>UD %sUD { align1 1Q };\n", reg, value); + break; + } + + case 200: { + ralloc_asprintf_append(src, "mov(16) %s<1>UD %sUD { align1 1H };\n", reg, value); + break; + } + + default: + unreachable("invalid gfx version"); + } + } +} + +static void +executor_macro_syncnop(executor_context *ec, char **src, char *line) +{ + switch (ec->devinfo->verx10) { + case 90: + case 110: { + /* Not needed. 
*/ + break; + } + + case 120: { + ralloc_strcat(src, "sync nop(8) null<0,1,0>UD { align1 WE_all 1H @1 $1 };\n"); + break; + } + + case 125: + case 200: { + ralloc_strcat(src, "sync nop(8) null<0,1,0>UD { align1 WE_all 1H A@1 $1 };\n"); + break; + } + + default: + unreachable("invalid gfx version"); + } +} + +static void +executor_macro_eot(executor_context *ec, char **src, char *line) +{ + switch (ec->devinfo->verx10) { + case 90: + case 110: { + ralloc_strcat(src, + "mov(8) g127<1>UD g0<8;8,1>UD { align1 WE_all 1Q };\n" + "send(8) null<1>UW g127<0,1,0>UD 0x82000010\n" + " thread_spawner MsgDesc: mlen 1 rlen 0 { align1 WE_all 1Q EOT };\n"); + break; + } + case 120: { + ralloc_strcat(src, + "mov(8) g127<1>UD g0<8;8,1>UD { align1 WE_all 1Q };\n" + "send(8) nullUD g127UD nullUD 0x02000000 0x00000000\n" + " thread_spawner MsgDesc: mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q @1 EOT };\n"); + break; + } + + case 125: { + ralloc_strcat(src, + "mov(8) g127<1>UD g0<8;8,1>UD { align1 WE_all 1Q };\n" + "send(8) nullUD g127UD nullUD 0x02000000 0x00000000\n" + " gateway MsgDesc: (open) mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q A@1 EOT };\n"); + break; + } + + case 200: { + ralloc_strcat(src, + "mov(16) g127<1>UD g0<1,1,0>UD { align1 WE_all 1H };\n" + "send(16) nullUD g127UD nullUD 0x02000000 0x00000000\n" + " gateway MsgDesc: (open) mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1H I@1 EOT };\n"); + break; + } + default: + unreachable("invalid gfx version"); + } +} + +static void +executor_macro_id(executor_context *ec, char **src, char *line) +{ + char *c = skip_prefix("@id", line); + parse_args_result r = parse_args(ec->mem_ctx, c); + + if (r.count != 1) + failf("@id needs 1 argument, found %d\n", r.count); + + const char *reg = r.args[0]; + + switch (ec->devinfo->verx10) { + case 90: + case 110: + case 120: + case 125: { + ralloc_asprintf_append(src, + "mov(8) g127<1>UW 0x76543210V { align1 WE_all 1Q };\n" + "mov(8) %s<1>UD g127<8,8,1>UW { align1 WE_all 1Q @1 };\n", reg); + break; 
+ } + + case 200: { + ralloc_asprintf_append(src, + "mov(8) g127<1>UW 0x76543210V { align1 WE_all 1Q };\n" + "add(8) g127.8<1>UW g127<1,1,0>UW 8UW { align1 WE_all 1Q @1 };\n" + "mov(16) %s<1>UD g127<8,8,1>UW { align1 WE_all 1Q @1 };\n", reg); + break; + } + + default: + unreachable("invalid gfx version"); + } +} + +static void +executor_macro_write(executor_context *ec, char **src, char *line) +{ + char *c = skip_prefix("@write", line); + parse_args_result r = parse_args(ec->mem_ctx, c); + + if (r.count != 2) + failf("@write needs 2 arguments, found %d\n", r.count); + + const char *offset_reg = r.args[0]; + const char *data_reg = r.args[1]; + + assert(ec->bo.data.addr <= 0xFFFFFFFF); + uint32_t base_addr = ec->bo.data.addr; + + switch (ec->devinfo->verx10) { + case 90: + case 110: + case 120: { + const char *send_suffix = ec->devinfo->verx10 < 120 ? "s" : ""; + ralloc_asprintf_append(src, + "mul(8) g127<1>UD %s<8;8,1>UD 0x4UW { align1 @1 1Q };\n" + "add(8) g127<1>UD g127<8;8,1>UD 0x%08xUD { align1 @1 1Q };\n" + "send%s(8) nullUD g127UD %sUD 0x2026efd 0x00000040\n" + " dp data 1 MsgDesc: (DC untyped surface write, Surface = 253, " + " SIMD8, Mask = 0xe) mlen 1 ex_mlen 1 rlen 0 " + " { align1 1Q @1 $1 };\n", + offset_reg, base_addr, send_suffix, data_reg); + executor_macro_syncnop(ec, src, "@syncnop"); + break; + } + + case 125: { + ralloc_asprintf_append(src, + "mul(8) g127<1>UD %s<1;1,0>UD 0x4UW { align1 @1 1Q };\n" + "add(8) g127<1>UD g127<1;1,0>UD 0x%08xUD { align1 @1 1Q };\n" + "send(8) nullUD g127UD %sUD 0x02000504 0x00000040\n" + " ugm MsgDesc: ( store, a32, d32, x, L1STATE_L3MOCS dst_len = 0, " + " src0_len = 1, src1_len = 1, flat ) base_offset 0 " + " { align1 1Q A@1 $1 };\n", + offset_reg, base_addr, data_reg); + executor_macro_syncnop(ec, src, "@syncnop"); + break; + } + + case 200: { + ralloc_asprintf_append(src, + "mul(16) g127<1>UD %s<1;1,0>UD 0x4UW { align1 @1 1Q };\n" + "add(16) g127<1>UD g127<1;1,0>UD 0x%08xUD { align1 @1 1Q };\n" + "send(16) nullUD 
g127UD %sUD 0x02000504 0x00000040\n" + " ugm MsgDesc: ( store, a32, d32, x, L1STATE_L3MOCS dst_len = 0, " + " src0_len = 1, src1_len = 1, flat ) base_offset 0 " + " { align1 1Q A@1 $1 };\n", + offset_reg, base_addr, data_reg); + executor_macro_syncnop(ec, src, "@syncnop"); + break; + } + + default: + unreachable("invalid gfx version"); + } +} + +static void +executor_macro_read(executor_context *ec, char **src, char *line) +{ + char *c = skip_prefix("@read", line); + parse_args_result r = parse_args(ec->mem_ctx, c); + + if (r.count != 2) + failf("@read needs 2 arguments, found %d\n", r.count); + + /* Order follows underlying SEND, destination first. */ + const char *data_reg = r.args[0]; + const char *offset_reg = r.args[1]; + + assert(ec->bo.data.addr <= 0xFFFFFFFF); + uint32_t base_addr = ec->bo.data.addr; + + switch (ec->devinfo->verx10) { + case 90: + case 110: + case 120: { + const char *send_suffix = ec->devinfo->verx10 < 120 ? "s" : ""; + ralloc_asprintf_append(src, + "mul(8) g127<1>UD %s<8;8,1>UD 0x4UW { align1 @1 1Q };\n" + "add(8) g127<1>UD g127<8;8,1>UD 0x%08xUD { align1 @1 1Q };\n" + "send%s(8) %sUD g127UD nullUD 0x2106efd 0x00000000\n" + " dp data 1 MsgDesc: (DC untyped surface read, Surface = 253, " + " SIMD8, Mask = 0xe) mlen 1 ex_mlen 0 rlen 1 " + " { align1 1Q @1 $1 };\n", + offset_reg, base_addr, send_suffix, data_reg); + executor_macro_syncnop(ec, src, "@syncnop"); + break; + } + + case 125: { + ralloc_asprintf_append(src, + "mul(8) g127<1>UD %s<1;1,0>UD 0x4UW { align1 @1 1Q };\n" + "add(8) g127<1>UD g127<1;1,0>UD 0x%08xUD { align1 @1 1Q };\n" + "send(8) %sUD g127UD nullUD 0x02100500 0x00000000\n" + " ugm MsgDesc: ( load, a32, d32, x, L1STATE_L3MOCS dst_len = 1, " + " src0_len = 1, flat ) src1_len = 0 base_offset 0 " + " { align1 1Q A@1 $1 };\n", + offset_reg, base_addr, data_reg); + executor_macro_syncnop(ec, src, "@syncnop"); + break; + } + + case 200: { + ralloc_asprintf_append(src, + "mul(8) g127<1>UD %s<1;1,0>UD 0x4UW { align1 @1 1Q };\n" + 
"add(8) g127<1>UD g127<1;1,0>UD 0x%08xUD { align1 @1 1Q };\n" + "send(8) %sUD g127UD nullUD 0x02100500 0x00000000\n" + " ugm MsgDesc: ( load, a32, d32, x, L1STATE_L3MOCS dst_len = 1, " + " src0_len = 1, flat ) src1_len = 0 base_offset 0 " + " { align1 1Q A@1 $1 };\n", + offset_reg, base_addr, data_reg); + executor_macro_syncnop(ec, src, "@syncnop"); + break; + } + + default: + unreachable("invalid gfx version"); + } +} + +static char * +find_macro_symbol(char *line) +{ + char *c = line; + while (isspace(*c)) c++; + return *c == '@' ? c : NULL; +} + +static bool +match_macro_name(const char *name, const char *line) +{ + if (!startswith(name, line)) + return false; + line += strlen(name); + return !*line || isspace(*line); +} + +const char * +executor_apply_macros(executor_context *ec, const char *original_src) +{ + char *scratch = ralloc_strdup(ec->mem_ctx, original_src); + + /* Create a ralloc'ed empty string so can call append to it later. */ + char *src = ralloc_strdup(ec->mem_ctx, ""); + + /* TODO: Create a @send macro for common combinations of MsgDesc. */ + static const struct { + const char *name; + void (*func)(executor_context *ec, char **output, char *line); + } macros[] = { + { "@eot", executor_macro_eot }, + { "@mov", executor_macro_mov }, + { "@write", executor_macro_write }, + { "@read", executor_macro_read }, + { "@id", executor_macro_id }, + { "@syncnop", executor_macro_syncnop }, + }; + + char *next = scratch; + while (next) { + char *line = next; + char *end = line; + + while (*end && *end != '\n') end++; + next = *end ? 
end + 1 : NULL; + *end = '\0'; + + char *macro = find_macro_symbol(line); + if (!macro) { + ralloc_asprintf_append(&src, "%s\n", line); + } else { + bool found = false; + for (int i = 0; i < ARRAY_SIZE(macros); i++) { + if (match_macro_name(macros[i].name, macro)) { + macros[i].func(ec, &src, macro); + found = true; + break; + } + } + if (!found) + failf("unsupported macro line: %s", macro); + } + } + + return src; +} diff --git a/src/intel/executor/executor_main.c b/src/intel/executor/executor_main.c new file mode 100644 index 00000000000..60accc25337 --- /dev/null +++ b/src/intel/executor/executor_main.c @@ -0,0 +1,850 @@ +/* + * Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "util/ralloc.h" + +#include +#include "drm-uapi/i915_drm.h" +#include "drm-uapi/xe_drm.h" + +#include "intel/compiler/brw_asm.h" +#include "intel/compiler/brw_isa_info.h" +#include "intel/common/intel_gem.h" +#include "intel/common/xe/intel_engine.h" +#include "intel/decoder/intel_decoder.h" +#include "intel/dev/intel_debug.h" + +#include "executor.h" + +enum { + /* Predictable base addresses here make it easier to spot errors. */ + EXECUTOR_BO_BATCH_ADDR = 0x10000000, + EXECUTOR_BO_EXTRA_ADDR = 0x20000000, + EXECUTOR_BO_DATA_ADDR = 0x30000000, + + /* Apply to all BOs. 
*/
+   EXECUTOR_BO_SIZE = 10 * 1024 * 1024,
+};
+
+/* Print the usage text to stdout.
+ *
+ * The two printf arguments fill in the GPU address of the data buffer
+ * (EXECUTOR_BO_DATA_ADDR) and the Lua release the tool was built against.
+ * The embedded example mirrors examples/help_example.lua.
+ */
+static void
+print_help(void)
+{
+   printf(
+      "Executes shaders written for Intel GPUs\n"
+      "usage: executor FILENAME\n"
+      "\n"
+      "The input is a Lua script that can perform data manipulation\n"
+      "and dispatch execution of compute shaders, written in Xe assembly,\n"
+      "the same format used by the brw_asm assembler or when dumping\n"
+      "shaders in debug mode.\n"
+      "\n"
+      "The goal is to have a tool to experiment directly with certain\n"
+      "assembly instructions and the shared units without having to\n"
+      "instrument the drivers.\n"
+      "\n"
+      "EXECUTION CONTEXT\n"
+      "\n"
+      "By default compute shaders are used with SIMD8 for Gfx9-125 and SIMD16\n"
+      "for Xe2. Only a single thread is dispatched. A data buffer is used to\n"
+      "pipe data into the shader and out of it, it is bound to the graphics\n"
+      "address 0x%08x.\n"
+      "\n"
+      "The Gfx versions have differences in their assembly and shared units, so\n"
+      "other than very simple examples, scripts for this program will be either\n"
+      "specific to a version or provide shader variants for multiple versions.\n"
+      "\n"
+      "ASSEMBLY MACROS\n"
+      "\n"
+      "In addition to regular instructions, the following macros will generate\n"
+      "assembly code based on the Gfx version being executed. Unlike in regular\n"
+      "instructions, REGs don't use regions and can't be immediates.\n"
+      "\n"
+      "- @eot\n"
+      " Send an EOT message.\n"
+      "\n"
+      "- @mov REG IMM\n"
+      " Like a regular MOV but accepts numbers in both decimal and\n"
+      " floating-point.\n"
+      "\n"
+      "- @id REG\n"
+      " Write a local invocation index into REG.\n"
+      "\n"
+      "- @read DST_REG OFFSET_REG\n"
+      " Read 32-bit values from the memory buffer at OFFSET_REG into DST_REG.\n"
+      "\n"
+      "- @write OFFSET_REG SRC_REG\n"
+      " Write 32-bit values from SRC_REG to the memory buffer at OFFSET_REG.\n"
+      "\n"
+      "- @syncnop\n"
+      " Produce a coarse grained sync.nop (when applicable) to ensure data from\n"
+      " macros above are read/written.\n"
+      "\n"
+      "LUA ENVIRONMENT\n"
+      "\n"
+      "In addition to the regular Lua standard library the following variables and\n"
+      "functions are available.\n"
+      "\n"
+      "- execute({src=STR, data=ARRAY}) -> ARRAY\n"
+      " Takes a table as argument. The 'src' in the table contains the shader to be\n"
+      " executed. The 'data' argument will be used to fill the data buffer with 32-bit\n"
+      " values. The function returns an ARRAY with the contents of the data buffer\n"
+      " after the shader completes.\n"
+      "\n"
+      "- dump(ARRAY, COUNT)\n"
+      " Pretty print the COUNT first elements of an array of 32-bit values.\n"
+      "\n"
+      "- check_ver(V, ...), check_verx10(V, ...)\n"
+      " Exit if the Gfx version being executed isn't in the arguments list.\n"
+      "\n"
+      "- ver, verx10\n"
+      " Variables containing the Gfx version being executed.\n"
+      "\n"
+      "This program was compiled with %s.\n"
+      "\n"
+      "ENVIRONMENT VARIABLES\n"
+      "\n"
+      "The following INTEL_DEBUG values (comma separated) are used:\n"
+      "\n"
+      " - bat Dumps the batch buffer.\n"
+      " - color Uses color for the above.\n"
+      " - cs Dumps the assembly after macro processing.\n"
+      "\n"
+      "EXAMPLE\n"
+      "\n"
+      "The following script\n"
+      "\n"
+      " local r = execute {\n"
+      " data={ [42] = 0x100 },\n"
+      " src=[[\n"
+      " @mov g1 42\n"
+      " @read g2 g1\n"
+      "\n"
+      " @id g3\n"
+      "\n"
+      " add(8) g4<1>UD g2<8,8,1>UD g3<8,8,1>UD { align1 @1 1Q };\n"
+      "\n"
+      " @write g3 g4\n"
+      " @eot\n"
+      " ]]\n"
+      " }\n"
+      "\n"
+      " dump(r, 4)\n"
+      "\n"
+      "Will produce the following output\n"
+      "\n"
+      " [0x00000000] 0x00000100 0x00000101 0x00000102 0x00000103\n"
+      "\n"
+      "More examples can be found in the examples/ directory in the source code.\n"
+      "\n", EXECUTOR_BO_DATA_ADDR, LUA_RELEASE);
+}
+
+static struct {
+   struct intel_device_info devinfo;
+   struct isl_device isl_dev;
+   struct brw_isa_info isa;
+   int fd;
+} E;
+
+#define genX_call(func, ...)
\
+   switch (E.devinfo.verx10) {                               \
+   case 90:  gfx9_  ##func(__VA_ARGS__); break;              \
+   case 110: gfx11_ ##func(__VA_ARGS__); break;              \
+   case 120: gfx12_ ##func(__VA_ARGS__); break;              \
+   case 125: gfx125_##func(__VA_ARGS__); break;              \
+   case 200: gfx20_ ##func(__VA_ARGS__); break;              \
+   default: unreachable("Unsupported hardware generation");  \
+   }
+
+/* Create a GEM buffer of size_in_bytes through the i915 or Xe uAPI
+ * (selected by devinfo->kmd_type), map it into the CPU address space and
+ * fill in *bo.
+ *
+ * addr is only recorded in bo->addr; nothing in this function binds the
+ * buffer to that GPU address.  The allocation cursor starts at the
+ * beginning of the mapping.  Every failure is reported through failf().
+ */
+static void
+executor_create_bo(executor_context *ec, executor_bo *bo, uint64_t addr, uint32_t size_in_bytes)
+{
+   /* Fake offset to pass to mmap(), obtained from the KMD specific ioctl. */
+   uint64_t mmap_offset;
+
+   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
+      struct drm_i915_gem_create gem_create = {
+         .size = size_in_bytes,
+      };
+
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
+      if (err)
+         failf("i915_gem_create");
+
+      struct drm_i915_gem_mmap_offset mm = {
+         .handle = gem_create.handle,
+         .flags = ec->devinfo->has_local_mem ? I915_MMAP_OFFSET_FIXED
+                                             : I915_MMAP_OFFSET_WC,
+      };
+
+      err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mm);
+      if (err)
+         failf("i915_gem_mmap_offset");
+
+      bo->handle = gem_create.handle;
+      mmap_offset = mm.offset;
+   } else {
+      assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);
+
+      struct drm_xe_gem_create gem_create = {
+         .size = size_in_bytes,
+         .cpu_caching = DRM_XE_GEM_CPU_CACHING_WB,
+         .placement = 1u << ec->devinfo->mem.sram.mem.instance,
+      };
+
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_GEM_CREATE, &gem_create);
+      if (err)
+         failf("xe_gem_create");
+
+      struct drm_xe_gem_mmap_offset mm = {
+         .handle = gem_create.handle,
+      };
+
+      err = intel_ioctl(ec->fd, DRM_IOCTL_XE_GEM_MMAP_OFFSET, &mm);
+      if (err)
+         failf("xe_gem_mmap_offset");
+
+      bo->handle = gem_create.handle;
+      mmap_offset = mm.offset;
+   }
+
+   bo->map = mmap(NULL, size_in_bytes, PROT_READ | PROT_WRITE,
+                  MAP_SHARED, ec->fd, mmap_offset);
+   /* mmap() reports failure with MAP_FAILED, never with NULL. */
+   if (bo->map == MAP_FAILED)
+      failf("mmap");
+
+   bo->size = size_in_bytes;
+   bo->addr = addr;
+   bo->cursor = bo->map;
+}
+
+static void
+executor_destroy_bo(executor_context *ec, executor_bo *bo)
+{
+   struct drm_gem_close gem_close = {
+      .handle = bo->handle,
+   };
+
+   int err = munmap(bo->map, bo->size);
+   if (err)
+      failf("munmap");
+
+   err = intel_ioctl(ec->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+   if (err)
+      failf("gem_close");
+
+   memset(bo, 0, sizeof(*bo));
+}
+
+/* Dump the used part of a BO (from the start of the mapping up to the
+ * cursor) as 32-bit words, eight per row, each row prefixed with its GPU
+ * address.
+ */
+static void
+executor_print_bo(executor_bo *bo, const char *name)
+{
+   const size_t used = (char *)bo->cursor - (char *)bo->map;
+   assert(used % 4 == 0);
+   uint32_t *dw = bo->map;
+   uint32_t len = used / 4;
+
+   /* Cast explicitly: uint64_t and size_t are not 'long' on every ABI. */
+   printf("=== %s (0x%08llx, %llu bytes) ===\n", name,
+          (unsigned long long)bo->addr, (unsigned long long)used);
+
+   for (uint32_t i = 0; i < len; i++) {
+      if ((i % 8) == 0) printf("[0x%08x] ", (i*4) + (uint32_t)bo->addr);
+      printf("0x%08x ", dw[i]);
+      if ((i % 8) == 7) printf("\n");
+   }
+   printf("\n");
+}
+
+/* Reserve size bytes at the current cursor of bo, with no alignment. */
+void *
+executor_alloc_bytes(executor_bo *bo, uint32_t size)
+{
+   return executor_alloc_bytes_aligned(bo, size, 0);
+}
+
+/* Reserve size bytes in bo and return a CPU pointer to them.
+ *
+ * When alignment is non-zero the returned pointer is rounded up to it; the
+ * mask arithmetic only works for powers of two.  A size of 0 just aligns
+ * the cursor.  Running past the end of the mapping is reported through
+ * failf() instead of silently handing out memory outside of the BO.
+ */
+void *
+executor_alloc_bytes_aligned(executor_bo *bo, uint32_t size, uint32_t alignment)
+{
+   uintptr_t start = (uintptr_t)bo->cursor;
+   if (alignment) {
+      start = (start + alignment-1) & ~((uintptr_t)alignment-1);
+   }
+
+   if (start + size > (uintptr_t)bo->map + bo->size)
+      failf("buffer object is full");
+
+   bo->cursor = (void *)(start + size);
+   return (void *)start;
+}
+
+/* Translate a CPU pointer inside the mapping of bo into its GPU address. */
+executor_address
+executor_address_of_ptr(executor_bo *bo, void *ptr)
+{
+   return (executor_address){ptr - bo->map + bo->addr};
+}
+
+/* Open the render node of the first usable Intel PCI GPU.
+ *
+ * On success *devinfo describes the device and the open file descriptor is
+ * returned; -1 is returned when no suitable device exists.
+ */
+static int
+get_drm_device(struct intel_device_info *devinfo)
+{
+   drmDevicePtr devices[8];
+   int max_devices = drmGetDevices2(0, devices, 8);
+
+   int i, fd = -1;
+   for (i = 0; i < max_devices; i++) {
+      if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
+          devices[i]->bustype == DRM_BUS_PCI &&
+          devices[i]->deviceinfo.pci->vendor_id == 0x8086) {
+         fd = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR | O_CLOEXEC);
+         if (fd < 0)
+            continue;
+
+         /* genX_call() has no code path for anything older than Gfx9. */
+         if (!intel_get_device_info_from_fd(fd, devinfo, -1, -1) ||
+             devinfo->ver < 9) {
+            close(fd);
+            fd = -1;
+            continue;
+         }
+
+         /* Found a device! */
+         break;
+      }
+   }
+
+   /* The list is owned by the caller of drmGetDevices2(); a negative
+    * return value is an error code and leaves nothing to release.
+    */
+   if (max_devices > 0)
+      drmFreeDevices(devices, max_devices);
+
+   return fd;
+}
+
+/* Batch decoder callback: return the BO whose GPU address range contains
+ * address, or a zeroed descriptor when it belongs to none of the three BOs.
+ */
+static struct intel_batch_decode_bo
+decode_get_bo(void *_ec, bool ppgtt, uint64_t address)
+{
+   executor_context *ec = _ec;
+   struct intel_batch_decode_bo bo = {0};
+
+   if (address >= ec->bo.batch.addr && address < ec->bo.batch.addr + ec->bo.batch.size) {
+      bo.addr = ec->bo.batch.addr;
+      bo.size = ec->bo.batch.size;
+      bo.map = ec->bo.batch.map;
+   } else if (address >= ec->bo.extra.addr && address < ec->bo.extra.addr + ec->bo.extra.size) {
+      bo.addr = ec->bo.extra.addr;
+      bo.size = ec->bo.extra.size;
+      bo.map = ec->bo.extra.map;
+   } else if (address >= ec->bo.data.addr && address < ec->bo.data.addr + ec->bo.data.size) {
+      bo.addr = ec->bo.data.addr;
+      bo.size = ec->bo.data.size;
+      bo.map = ec->bo.data.map;
+   }
+
+   return bo;
+}
+
+static unsigned
+decode_get_state_size(void *_ec, uint64_t address, uint64_t base_address)
+{
+   return EXECUTOR_BO_SIZE;
+}
+
+/* Copy the 'data' table of an execute() call into the data BO.
+ *
+ * Keys are dword indices into the buffer and must be integers inside of
+ * it; values must be convertible to integers and are truncated to 32 bits
+ * when stored.  Anything else is reported through failf(), which is
+ * presumably fatal -- NOTE(review): confirm, the store below relies on it.
+ */
+static void
+parse_execute_data(executor_context *ec, lua_State *L, int table_idx)
+{
+   uint32_t *data = ec->bo.data.map;
+   const lua_Integer num_dwords = ec->bo.data.size / 4;
+
+   lua_pushvalue(L, table_idx);
+
+   lua_pushnil(L);
+   while (lua_next(L, -2) != 0) {
+      int val_idx = lua_gettop(L);
+      int key_idx = val_idx - 1;
+
+      if (lua_type(L, key_idx) != LUA_TNUMBER || !lua_isinteger(L, key_idx))
+         failf("invalid key for data in execute call");
+
+      /* Checked at runtime (not with assert) since keys come from the
+       * script: negative keys and key == num_dwords are out of bounds.
+       */
+      lua_Integer key = lua_tointeger(L, key_idx);
+      if (key < 0 || key >= num_dwords)
+         failf("key out of range for data in execute call");
+
+      int is_integer = 0;
+      lua_Integer val = lua_tointegerx(L, val_idx, &is_integer);
+      if (!is_integer)
+         failf("invalid value for data in execute call");
+      data[key] = val;
+
+      lua_pop(L, 1);
+   }
+
+   lua_pop(L, 1);
+}
+
+static void
+parse_execute_args(executor_context *ec, lua_State *L, executor_params *params)
+{
+   int opts = lua_gettop(L);
+
+   lua_pushnil(L);
+
+   while (lua_next(L, opts) != 0) {
+      int val_idx = lua_gettop(L);
+      int key_idx = val_idx - 1;
+
+      if (lua_type(L, key_idx) != LUA_TSTRING) {
+         lua_pop(L, 1);
+         continue;
+      }
+
+      const char *key = lua_tostring(L, key_idx);
+
+      if (!strcmp(key, "src")) {
+         params->original_src = ralloc_strdup(ec->mem_ctx, luaL_checkstring(L, val_idx));
+      } else if
(!strcmp(key, "data")) { + parse_execute_data(ec, L, val_idx); + } else { + failf("unknown parameter '%s' for execute()", key); + } + + lua_pop(L, 1); + } +} + +static void +executor_context_setup(executor_context *ec) +{ + if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) { + struct drm_i915_gem_context_create create = {0}; + int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create); + if (err) + failf("i915_gem_context_create"); + ec->i915.ctx_id = create.ctx_id; + } else { + assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE); + + struct drm_xe_vm_create create = { + .flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE, + }; + int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_CREATE, &create); + if (err) + failf("xe_vm_create"); + ec->xe.vm_id = create.vm_id; + + struct drm_xe_engine_class_instance instance = {0}; + + struct intel_query_engine_info *engines_info = xe_engine_get_info(ec->fd); + assert(engines_info); + + bool found_engine = false; + for (int i = 0; i < engines_info->num_engines; i++) { + struct intel_engine_class_instance *e = &engines_info->engines[i]; + if (e->engine_class == INTEL_ENGINE_CLASS_RENDER) { + instance.engine_class = DRM_XE_ENGINE_CLASS_RENDER; + instance.engine_instance = e->engine_instance; + instance.gt_id = e->gt_id; + found_engine = true; + break; + } + } + assert(found_engine); + + struct drm_xe_exec_queue_create queue_create = { + .vm_id = ec->xe.vm_id, + .width = 1, + .num_placements = 1, + .instances = (uintptr_t)&instance, + }; + err = intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &queue_create); + if (err) + failf("xe_exec_queue_create"); + ec->xe.queue_id = queue_create.exec_queue_id; + } + + executor_create_bo(ec, &ec->bo.batch, EXECUTOR_BO_BATCH_ADDR, EXECUTOR_BO_SIZE); + executor_create_bo(ec, &ec->bo.extra, EXECUTOR_BO_EXTRA_ADDR, EXECUTOR_BO_SIZE); + executor_create_bo(ec, &ec->bo.data, EXECUTOR_BO_DATA_ADDR, EXECUTOR_BO_SIZE); + + uint32_t *data = ec->bo.data.map; + for (int i = 0; i < EXECUTOR_BO_SIZE / 4; 
i++) + data[i] = 0xABABABAB; +} + +static void +executor_context_dispatch(executor_context *ec) +{ + if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) { + struct drm_i915_gem_exec_object2 objs[] = { + { + .handle = ec->bo.batch.handle, + .offset = ec->bo.batch.addr, + .flags = EXEC_OBJECT_PINNED, + }, + { + .handle = ec->bo.extra.handle, + .offset = ec->bo.extra.addr, + .flags = EXEC_OBJECT_PINNED, + }, + { + .handle = ec->bo.data.handle, + .offset = ec->bo.data.addr, + .flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE, + }, + }; + + struct drm_i915_gem_execbuffer2 exec = {0}; + exec.buffers_ptr = (uintptr_t)objs; + exec.buffer_count = ARRAY_SIZE(objs); + exec.batch_start_offset = ec->batch_start - ec->bo.batch.addr; + exec.flags = I915_EXEC_BATCH_FIRST; + exec.rsvd1 = ec->i915.ctx_id; + + int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &exec); + if (err) + failf("i915_gem_execbuffer2"); + + struct drm_i915_gem_wait wait = {0}; + wait.bo_handle = ec->bo.batch.handle; + wait.timeout_ns = INT64_MAX; + + err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + if (err) + failf("i915_gem_wait"); + } else { + assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE); + + /* First syncobj is signalled by the binding operation and waited by the + * execution of the batch buffer. + * + * Second syncobj is singalled by the execution of batch buffer and + * waited at the end. 
+ */ + uint32_t sync_handles[2] = {0}; + for (int i = 0; i < 2; i++) { + struct drm_syncobj_create sync_create = {0}; + int err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_CREATE, &sync_create); + if (err) + failf("syncobj_create"); + sync_handles[i] = sync_create.handle; + } + + struct drm_xe_vm_bind_op bind_ops[] = { + { + .op = DRM_XE_VM_BIND_OP_MAP, + .obj = ec->bo.batch.handle, + .addr = ec->bo.batch.addr, + .range = EXECUTOR_BO_SIZE, + .pat_index = ec->devinfo->pat.cached_coherent.index, + }, + { + .op = DRM_XE_VM_BIND_OP_MAP, + .obj = ec->bo.extra.handle, + .addr = ec->bo.extra.addr, + .range = EXECUTOR_BO_SIZE, + .pat_index = ec->devinfo->pat.cached_coherent.index, + }, + { + .op = DRM_XE_VM_BIND_OP_MAP, + .obj = ec->bo.data.handle, + .addr = ec->bo.data.addr, + .range = EXECUTOR_BO_SIZE, + .pat_index = ec->devinfo->pat.cached_coherent.index, + }, + }; + + struct drm_xe_sync bind_syncs[] = { + { + .type = DRM_XE_SYNC_TYPE_SYNCOBJ, + .handle = sync_handles[0], + .flags = DRM_XE_SYNC_FLAG_SIGNAL, + }, + }; + + struct drm_xe_vm_bind bind = { + .vm_id = ec->xe.vm_id, + .num_binds = ARRAY_SIZE(bind_ops), + .vector_of_binds = (uintptr_t)bind_ops, + .num_syncs = 1, + .syncs = (uintptr_t)bind_syncs, + }; + + int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_BIND, &bind); + if (err) + failf("xe_vm_bind"); + + struct drm_xe_sync exec_syncs[] = { + { + .type = DRM_XE_SYNC_TYPE_SYNCOBJ, + .handle = sync_handles[0], + }, + { + .type = DRM_XE_SYNC_TYPE_SYNCOBJ, + .handle = sync_handles[1], + .flags = DRM_XE_SYNC_FLAG_SIGNAL, + } + }; + + struct drm_xe_exec exec = { + .exec_queue_id = ec->xe.queue_id, + .num_batch_buffer = 1, + .address = ec->batch_start, + .num_syncs = 2, + .syncs = (uintptr_t)exec_syncs, + }; + err = intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC, &exec); + if (err) + failf("xe_exec"); + + struct drm_syncobj_wait wait = { + .count_handles = 1, + .handles = (uintptr_t)&sync_handles[1], + .timeout_nsec = INT64_MAX, + }; + err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_WAIT, 
&wait); + if (err) + failf("syncobj_wait"); + } +} + +static void +executor_context_teardown(executor_context *ec) +{ + executor_destroy_bo(ec, &ec->bo.batch); + executor_destroy_bo(ec, &ec->bo.extra); + executor_destroy_bo(ec, &ec->bo.data); + + if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) { + struct drm_i915_gem_context_destroy destroy = { + .ctx_id = ec->i915.ctx_id, + }; + int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy); + if (err) + failf("i915_gem_context_destroy"); + } else { + assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE); + + struct drm_xe_exec_queue_destroy queue_destroy = { + .exec_queue_id = ec->xe.queue_id, + }; + int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC_QUEUE_DESTROY, &queue_destroy); + if (err) + failf("xe_exec_queue_destroy"); + + struct drm_xe_vm_destroy destroy = { + .vm_id = ec->xe.vm_id, + }; + err = intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_DESTROY, &destroy); + if (err) + failf("xe_vm_destroy"); + } +} + +static int +l_execute(lua_State *L) +{ + executor_context ec = { + .mem_ctx = ralloc_context(NULL), + .devinfo = &E.devinfo, + .isl_dev = &E.isl_dev, + .fd = E.fd, + }; + + executor_context_setup(&ec); + + executor_params params = {0}; + + { + if (lua_gettop(L) != 1) + failf("execute() must have a single table argument"); + + parse_execute_args(&ec, L, ¶ms); + + const char *src = executor_apply_macros(&ec, params.original_src); + + FILE *f = fmemopen((void *)src, strlen(src), "r"); + brw_assemble_result asm = brw_assemble(ec.mem_ctx, ec.devinfo, f, "", 0); + fclose(f); + + if (INTEL_DEBUG(DEBUG_CS) || !asm.bin) { + printf("=== Processed assembly source ===\n" + "%s" + "=================================\n\n", src); + } + + if (!asm.bin) + failf("assembler failure"); + + params.kernel_bin = asm.bin; + params.kernel_size = asm.bin_size; + } + + genX_call(emit_execute, &ec, ¶ms); + + if (INTEL_DEBUG(DEBUG_BATCH)) { + struct intel_batch_decode_ctx decoder; + enum intel_batch_decode_flags flags = 
INTEL_BATCH_DECODE_DEFAULT_FLAGS; + if (INTEL_DEBUG(DEBUG_COLOR)) + flags |= INTEL_BATCH_DECODE_IN_COLOR; + + intel_batch_decode_ctx_init_brw(&decoder, &E.isa, &E.devinfo, stdout, + flags, NULL, decode_get_bo, decode_get_state_size, &ec); + + assert(ec.bo.batch.cursor > ec.bo.batch.map); + const int batch_offset = ec.batch_start - ec.bo.batch.addr; + const int batch_size = (ec.bo.batch.cursor - ec.bo.batch.map) - batch_offset; + assert(batch_offset < batch_size); + + intel_print_batch(&decoder, ec.bo.batch.map, batch_size, ec.batch_start, false); + + intel_batch_decode_ctx_finish(&decoder); + } + + executor_context_dispatch(&ec); + + { + /* TODO: Use userdata to return a wrapped C array instead of building + * values. Could make integration with array operations better. + */ + uint32_t *data = ec.bo.data.map; + const int n = ec.bo.data.size / 4; + lua_createtable(L, n, 0); + for (int i = 0; i < 8; i++) { + lua_pushinteger(L, data[i]); + lua_seti(L, -2, i); + } + } + + executor_context_teardown(&ec); + ralloc_free(ec.mem_ctx); + + return 1; +} + +static int +l_dump(lua_State *L) +{ + /* TODO: Use a table to add options for the dump, e.g. + * starting offset, format, etc. 
+ */ + + assert(lua_type(L, 1) == LUA_TTABLE); + assert(lua_type(L, 2) == LUA_TNUMBER); + assert(lua_isinteger(L, 2)); + + lua_Integer len_ = lua_tointeger(L, 2); + assert(len_ >= 0 && len_ <= INT_MAX); + int len = len_; + + int i; + for (i = 0; i < len; i++) { + if (i%8 == 0) printf("[0x%08x]", i * 4); + lua_rawgeti(L, 1, i); + lua_Integer val = lua_tointeger(L, -1); + printf(" 0x%08x", (uint32_t)val); + lua_pop(L, 1); + if (i%8 == 7) printf("\n"); + } + if (i%8 != 0) printf("\n"); + return 0; +} + +static int +l_check_ver(lua_State *L) +{ + int top = lua_gettop(L); + for (int i = 1; i <= top; i++) { + lua_Integer v = luaL_checknumber(L, i); + if (E.devinfo.ver == v) { + return 0; + } + } + failf("script doesn't support version=%d verx10=%d\n", + E.devinfo.ver, E.devinfo.verx10); + return 0; +} + +static int +l_check_verx10(lua_State *L) +{ + int top = lua_gettop(L); + for (int i = 1; i <= top; i++) { + lua_Integer v = luaL_checknumber(L, i); + if (E.devinfo.verx10 == v) { + return 0; + } + } + failf("script doesn't support version=%d verx10=%d\n", + E.devinfo.ver, E.devinfo.verx10); + return 0; +} + +/* TODO: Review numeric limits in the code, specially around Lua integer + * conversion. + */ + +int +main(int argc, char *argv[]) +{ + if (argc < 2 || + !strcmp(argv[1], "--help") || + !strcmp(argv[1], "-help") || + !strcmp(argv[1], "-h") || + !strcmp(argv[1], "help")) { + print_help(); + return 0; + } + + if (argc > 2) { + /* TODO: Expose extra arguments to the script as a variable. 
*/ + failf("invalid extra arguments\nusage: executor FILENAME"); + return 1; + } + + process_intel_debug_variable(); + + E.fd = get_drm_device(&E.devinfo); + isl_device_init(&E.isl_dev, &E.devinfo); + brw_init_isa_info(&E.isa, &E.devinfo); + assert(E.devinfo.kmd_type == INTEL_KMD_TYPE_I915 || + E.devinfo.kmd_type == INTEL_KMD_TYPE_XE); + + lua_State *L = luaL_newstate(); + + /* TODO: Could be nice to export some kind of builder interface, + * maybe even let the script construct a shader at the BRW IR + * level and let the later passes kick in. + */ + + luaL_openlibs(L); + + lua_pushinteger(L, E.devinfo.ver); + lua_setglobal(L, "ver"); + + lua_pushinteger(L, E.devinfo.verx10); + lua_setglobal(L, "verx10"); + + lua_pushcfunction(L, l_execute); + lua_setglobal(L, "execute"); + + lua_pushcfunction(L, l_dump); + lua_setglobal(L, "dump"); + + lua_pushcfunction(L, l_check_ver); + lua_setglobal(L, "check_ver"); + + lua_pushcfunction(L, l_check_verx10); + lua_setglobal(L, "check_verx10"); + + const char *filename = argv[1]; + int err = luaL_loadfile(L, filename); + if (err) + failf("failed to load script: %s", lua_tostring(L, -1)); + + err = lua_pcall(L, 0, 0, 0); + if (err) + failf("failed to run script: %s", lua_tostring(L, -1)); + + lua_close(L); + close(E.fd); + + return 0; +} + +void +failf(const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "ERROR: "); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); + exit(1); +} diff --git a/src/intel/executor/meson.build b/src/intel/executor/meson.build new file mode 100644 index 00000000000..8de7aafc21f --- /dev/null +++ b/src/intel/executor/meson.build @@ -0,0 +1,58 @@ +# Copyright © 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +if not dep_lua.found() + subdir_done() +endif + +executor_flags = [ + no_override_init_args, + sse2_args, +] + +executor_includes = [ + inc_include, + inc_src, + inc_intel, +] + +executor_hw_libs = [] +foreach v: ['90', '110', '120', '125', '200'] + executor_hw_libs += static_library( + 'executor_hw_ver@0@'.format(v), + ['executor_genx.c', gen_xml_pack], + include_directories: [executor_includes], + c_args: [ + executor_flags, + '-DGFX_VERx10=@0@'.format(v), + ], + gnu_symbol_visibility: 'hidden', + dependencies: [ + dep_valgrind, + idep_genxml, + ], + ) +endforeach + +executor = executable( + 'executor', + [ + 'executor_main.c', + 'executor_macros.c', + ], + dependencies: [ + dep_libdrm, + dep_lua, + dep_valgrind, + idep_brw_asm, + idep_genxml, + idep_intel_decoder_brw, + idep_intel_dev, + idep_libintel_common, + ], + include_directories: [executor_includes], + link_with: [executor_hw_libs], + c_args: [executor_flags], + gnu_symbol_visibility: 'hidden', + install: true +) diff --git a/src/intel/meson.build b/src/intel/meson.build index 117f9a1f5a7..41c48610ae1 100644 --- a/src/intel/meson.build +++ b/src/intel/meson.build @@ -23,6 +23,7 @@ if with_intel_hasvk or with_intel_vk or with_gallium_iris endif if with_intel_tools subdir('tools') + subdir('executor') endif if get_option('vulkan-layers').contains('intel-nullhw') subdir('nullhw-layer')