intel: Add executor tool

Add a tool that programs the hardware just enough to be able to execute compute shaders and then executes a script that can perform data manipulation and dispatch execution of the shaders (written in Xe assembly). The goal is to have a tool to experiment directly with certain assembly instructions and the shared units without having to instrument the drivers. To make it more convenient to write assembly, a few macros (indicated by the @-symbol) will be processed into the full instruction. For example, the script ``` local r = execute { data={ [42] = 0x100 }, src=[[ @mov g1 42 @read g2 g1 @id g3 add(8) g4<1>UD g2<8,8,1>UD g3<8,8,1>UD { align1 @1 1Q }; @write g3 g4 @eot ]] } dump(r, 4) ``` produces ``` [0x00000000] 0x00000100 0x00000101 0x00000102 0x00000103 ``` There's a help message inside the code that describes the script environment and the macros for assembly sources. Acked-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30062>
2025-12-25 19:30:11 +01:00 · 2024-07-06 21:44:45 -07:00 · 2024-07-06 21:44:45 -07:00 · e72bf2d02f
commit e72bf2d02f
parent 6267585778
12 changed files with 1729 additions and 0 deletions
--- a/src/intel/executor/examples/bfi.lua
+++ b/src/intel/executor/examples/bfi.lua
@ -0,0 +1,41 @@
+-- BFI seems available on Gfx9, need to fix the emission code for that.
+check_verx10(110, 120, 125, 200)
+
+-- CPU reference model for the bfi1 + bfi2 sequence dispatched by BFI().
+--
+--   a: field width  (only the low 5 bits are used)
+--   b: field offset (only the low 5 bits are used)
+--   c: value inserted into the field
+--   d: base value supplying the bits outside the field
+--
+-- Lua integers are 64 bits wide, so the result is truncated to 32 bits to
+-- stay comparable with the 32-bit value read back from the GPU when
+-- width + offset goes past bit 31.
+function BFI_simulation(a, b, c, d)
+  local width  = a & 0x1F
+  local offset = b & 0x1F
+  local mask   = ((1 << width) - 1) << offset
+  return (((c << offset) & mask) | (d & ~mask)) & 0xFFFFFFFF
+end
+
+-- Runs bfi1 followed by bfi2 on the GPU with the four operands and returns
+-- element 0 of the result returned by execute().
+function BFI(a, b, c, d)
+  -- The operands occupy indices 0..3 of the data table.
+  local inputs = { [0] = a, b, c, d }
+
+  local source = [[
+      @id   g9
+      @mov  g11  0
+      @mov  g12  1
+      @mov  g13  2
+      @mov  g14  3
+
+      @read g1 g11
+      @read g2 g12
+      @read g3 g13
+      @read g4 g14
+
+      bfi1(8)  g5<1>UD  g1<8,8,1>UD  g2<8,8,1>UD               { align1 @1 1Q };
+      bfi2(8)  g6<1>UD  g5<8,8,1>UD  g3<8,8,1>UD  g4<8,8,1>UD  { align1 @1 1Q };
+
+      @write g9 g6
+      @eot
+    ]]
+
+  local result = execute({ data = inputs, src = source })
+  return result[0]
+end
+
+-- Formats a value as a 0x-prefixed, zero-padded hexadecimal string.
+function Hex(v)
+  return ("0x%08x"):format(v)
+end
+
+-- Compare the GPU result against the CPU model for one set of operands.
+local width, offset, insert, base = 12, 12, 0xAAAAAAAA, 0xBBBBBBBB
+
+print("calculated", Hex(BFI(width, offset, insert, base)))
+print("expected",   Hex(BFI_simulation(width, offset, insert, base)))
--- a/src/intel/executor/examples/dp4a.lua
+++ b/src/intel/executor/examples/dp4a.lua
@ -0,0 +1,41 @@
+--[[
+
+Execute the example from the Dot Product 4 Accumulate
+instruction as seen in the PRM.
+
+    mov (1) r1.0:d 0x0102037F:d
+    // (char4)(0x1,0x2,0x3,0x7F)
+    mov (1) r2.0:d 50:d
+    dp4a (1) r3.0:d r2:d r1:d r1:d
+    // r3.0 = 50 + (0x1*0x1 + 0x2*0x2 + 0x3*0x3 + 0x7F*0x7F)
+    // = 50 + (1 + 4 + 9 + 16129)
+    // = 16193
+
+--]]
+
+check_ver(12)
+
+-- CPU reference for dp4a: the accumulator c plus the dot product of the
+-- first four elements of a and b.
+function DP4A(a, b, c)
+  local acc = c
+  local lane = 1
+  while lane <= 4 do
+    acc = acc + a[lane] * b[lane]
+    lane = lane + 1
+  end
+  return acc
+end
+
+-- Same operands as the PRM example quoted at the top of this file.
+local source = [[
+    @id   g9
+
+    @mov  g1  0x0102037F
+    @mov  g2  50
+
+    dp4a(8)  g3<1>UD  g2<8,8,1>UD  g1<8,8,1>UD  g1<8,8,1>UD  { align1 @1 1Q };
+
+    @write g9 g3
+    @eot
+  ]]
+
+local result = execute({ src = source })
+
+print("expected", DP4A({1,2,3,0x7F}, {1,2,3,0x7F}, 50))
+print("calculated", result[0])
--- a/src/intel/executor/examples/help_example.lua
+++ b/src/intel/executor/examples/help_example.lua
@ -0,0 +1,18 @@
+-- Example from the help message.
+
+-- Reads the value stored at index 42, adds the per-channel id to it and
+-- writes the sums back indexed by that id.
+local source = [[
+    @mov     g1      42
+    @read    g2      g1
+
+    @id      g3
+
+    add(8)   g4<1>UD  g2<8,8,1>UD  g3<8,8,1>UD  { align1 @1 1Q };
+
+    @write   g3       g4
+    @eot
+  ]]
+
+local result = execute({
+  data = { [42] = 0x100 },
+  src  = source,
+})
+
+dump(result, 4)
--- a/src/intel/executor/examples/nop.lua
+++ b/src/intel/executor/examples/nop.lua
@ -0,0 +1,6 @@
+-- Minimal shader: a single nop followed by the end-of-thread macro.
+local source = [[
+    nop;
+    @eot
+  ]]
+
+execute({ src = source })
--- a/src/intel/executor/examples/test.lua
+++ b/src/intel/executor/examples/test.lua
@ -0,0 +1,20 @@
+-- Indices 0..7 of the input hold 0, 4, 8, ..., 28.
+local input = {}
+for i = 0, 7 do
+  input[i] = 4 * i
+end
+
+-- Each channel reads its own entry, adds 0x100 and writes it back.
+local source = [[
+    @id    g1
+    @read  g3 g1
+
+    add(8) g3<1>UD  g3<8,8,1>UD  0x100UD  { align1 1Q };
+
+    @write g1 g3
+
+    @eot
+  ]]
+
+local result = execute({ data = input, src = source })
+
+dump(result, 8)
--- a/src/intel/executor/executor.h
+++ b/src/intel/executor/executor.h
@ -0,0 +1,94 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef EXECUTOR_H
+#define EXECUTOR_H
+
+#include <stdint.h>
+
+#include "intel/dev/intel_device_info.h"
+#include "intel/isl/isl.h"
+
+/* A buffer object as tracked by the executor.
+ *
+ * NOTE(review): the field meanings below are inferred from the names and
+ * from how the BOs are used by the genX and macro code; confirm against
+ * the code that creates the BOs.
+ */
+typedef struct {
+   /* presumably the BO size in bytes — TODO confirm */
+   uint32_t size;
+   /* presumably the kernel (GEM) handle — TODO confirm */
+   uint32_t handle;
+   /* presumably the CPU mapping of the BO — TODO confirm */
+   void *map;
+   /* presumably the position advanced by executor_alloc_bytes() — TODO confirm */
+   void *cursor;
+   /* GPU address; the data BO's address is embedded in the @read/@write sends */
+   uint64_t addr;
+} executor_bo;
+
+/* State shared by the code that builds the batch and expands macros. */
+typedef struct {
+   /* ralloc context used for the strings built during macro expansion */
+   void *mem_ctx;
+
+   /* devinfo->verx10 selects which assembly each macro expands to */
+   struct intel_device_info *devinfo;
+   /* passed to isl_mocs() when emitting the batch */
+   struct isl_device *isl_dev;
+   /* presumably the DRM device file descriptor — TODO confirm */
+   int fd;
+
+   /* presumably only meaningful when running on the i915 kernel driver */
+   struct {
+      uint32_t ctx_id;
+   } i915;
+
+   /* presumably only meaningful when running on the xe kernel driver */
+   struct {
+      uint32_t vm_id;
+      uint32_t queue_id;
+   } xe;
+
+   struct {
+      /* commands are emitted here by executor_batch_emit() */
+      executor_bo batch;
+      /* holds the kernel binary (and the interface descriptor before Gfx12.5) */
+      executor_bo extra;
+      /* buffer that the @read and @write macros address */
+      executor_bo data;
+   } bo;
+
+   /* offset of the first command in the batch BO, set by genX(emit_execute) */
+   uint64_t batch_start;
+} executor_context;
+
+/* Inputs for one execution. */
+typedef struct {
+   /* presumably the assembly source before macro expansion — TODO confirm */
+   const char *original_src;
+
+   /* kernel_size bytes at kernel_bin are copied into the extra BO */
+   void *kernel_bin;
+   uint32_t kernel_size;
+} executor_params;
+
+/* Address type handed to the genxml packing code (__gen_address_type). */
+typedef struct {
+   uint64_t offset;
+} executor_address;
+
+/* Address-combining hook for the genxml packing code (installed as
+ * __gen_combine_address in executor_genx.c).  The result is simply the
+ * address offset plus delta; data and location are ignored.
+ */
+__attribute__((unused)) static uint64_t
+executor_combine_address(void *data, void *location,
+                         executor_address address, uint32_t delta)
+{
+   const uint64_t combined = address.offset + delta;
+   return combined;
+}
+
+executor_address executor_address_of_ptr(executor_bo *bo, void *ptr);
+
+void *executor_alloc_bytes(executor_bo *bo, uint32_t size);
+void *executor_alloc_bytes_aligned(executor_bo *bo, uint32_t size, uint32_t alignment);
+
+void failf(const char *fmt, ...) PRINTFLIKE(1, 2);
+
+const char *executor_apply_macros(executor_context *ec, const char *original_src);
+
+#ifdef genX
+#  include "executor_genx.h"
+#else
+#  define genX(x) gfx9_##x
+#  include "executor_genx.h"
+#  undef genX
+#  define genX(x) gfx11_##x
+#  include "executor_genx.h"
+#  undef genX
+#  define genX(x) gfx12_##x
+#  include "executor_genx.h"
+#  undef genX
+#  define genX(x) gfx125_##x
+#  include "executor_genx.h"
+#  undef genX
+#  define genX(x) gfx20_##x
+#  include "executor_genx.h"
+#  undef genX
+#endif
+
+#endif /* EXECUTOR_H */
--- a/src/intel/executor/executor_genx.c
+++ b/src/intel/executor/executor_genx.c
@ -0,0 +1,183 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "executor.h"
+
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#else
+#define VG(x) ((void)0)
+#endif
+
+#define __gen_address_type executor_address
+#define __gen_combine_address executor_combine_address
+#define __gen_user_data void
+
+#include "intel/genxml/gen_macros.h"
+#include "intel/genxml/genX_pack.h"
+
+#define __executor_cmd_length(cmd) cmd ## _length
+#define __executor_cmd_header(cmd) cmd ## _header
+#define __executor_cmd_pack(cmd) cmd ## _pack
+
+/* Emit one command into the batch BO.
+ *
+ * Expands to a for-loop header whose body runs at most once: `name` is
+ * declared as a `struct cmd` initialized with the command's header, the
+ * statement following the macro fills in its fields, and the loop
+ * "increment" packs the struct into the space reserved in ec->bo.batch
+ * and then clears _dst so the loop terminates.  The body is skipped if the
+ * allocation returns NULL.  An `executor_context *ec` must be in scope at
+ * the call site.
+ */
+#define executor_batch_emit(cmd, name)                                               \
+   for (struct cmd name = { __executor_cmd_header(cmd) },                            \
+        *_dst = executor_alloc_bytes(&ec->bo.batch, __executor_cmd_length(cmd) * 4); \
+        __builtin_expect(_dst != NULL, 1);                                           \
+        ({ __executor_cmd_pack(cmd)(0, _dst, &name);                                 \
+           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __executor_cmd_length(cmd) * 4));  \
+           _dst = NULL;                                                              \
+         }))
+
+/* Emit a PIPE_CONTROL with the command streamer stall and pipe control
+ * flush bits set, plus the HDC pipeline flush bit on Gfx12 and later.
+ */
+static void
+emit_pipe_control(executor_context *ec)
+{
+   executor_batch_emit(GENX(PIPE_CONTROL), pc) {
+      pc.CommandStreamerStallEnable = true;
+      pc.PipeControlFlushEnable     = true;
+#if GFX_VER >= 12
+      pc.HDCPipelineFlushEnable     = true;
+#endif
+   }
+}
+
+/* Emit STATE_BASE_ADDRESS with every base set to zero, so that the offsets
+ * programmed elsewhere are full addresses, and with the given MOCS applied
+ * to every MOCS field.
+ */
+static void
+emit_state_base_address(executor_context *ec, uint32_t mocs)
+{
+   const executor_address zero_base = {0};
+   const uint32_t buffer_size       = (1 << 20) - 1;
+
+   executor_batch_emit(GENX(STATE_BASE_ADDRESS), sba) {
+      /* Base addresses. */
+      sba.GeneralStateBaseAddress               = zero_base;
+      sba.DynamicStateBaseAddress               = zero_base;
+      sba.InstructionBaseAddress                = zero_base;
+      sba.IndirectObjectBaseAddress             = zero_base;
+
+      sba.GeneralStateBaseAddressModifyEnable   = true;
+      sba.DynamicStateBaseAddressModifyEnable   = true;
+      sba.InstructionBaseAddressModifyEnable    = true;
+      sba.IndirectObjectBaseAddressModifyEnable = true;
+
+      /* Buffer sizes. */
+      sba.GeneralStateBufferSize                = buffer_size;
+      sba.DynamicStateBufferSize                = buffer_size;
+      sba.InstructionBufferSize                 = buffer_size;
+      sba.IndirectObjectBufferSize              = buffer_size;
+
+      sba.GeneralStateBufferSizeModifyEnable    = true;
+      sba.DynamicStateBufferSizeModifyEnable    = true;
+      sba.InstructionBuffersizeModifyEnable     = true;
+      sba.IndirectObjectBufferSizeModifyEnable  = true;
+
+      /* Memory object control state. */
+      sba.GeneralStateMOCS                      = mocs;
+      sba.DynamicStateMOCS                      = mocs;
+      sba.InstructionMOCS                       = mocs;
+      sba.IndirectObjectMOCS                    = mocs;
+      sba.SurfaceStateMOCS                      = mocs;
+      sba.StatelessDataPortAccessMOCS           = mocs;
+      sba.BindlessSurfaceStateMOCS              = mocs;
+#if GFX_VER >= 11
+      sba.BindlessSamplerStateMOCS              = mocs;
+#endif
+
+#if GFX_VERx10 >= 125
+      sba.L1CacheControl = L1CC_WB;
+#endif
+   }
+}
+
+/* Build the batch that runs the compute kernel in params on a single
+ * thread group.
+ *
+ * The kernel binary is copied into the "extra" BO, the offset of the first
+ * command is recorded in ec->batch_start, and the commands that select the
+ * compute pipeline (before Xe2), program the base addresses, dispatch the
+ * walker and end the batch are emitted into the batch BO.
+ */
+void
+genX(emit_execute)(executor_context *ec, const executor_params *params)
+{
+   uint32_t *kernel = executor_alloc_bytes(&ec->bo.extra, params->kernel_size);
+   memcpy(kernel, params->kernel_bin, params->kernel_size);
+   executor_address kernel_addr = executor_address_of_ptr(&ec->bo.extra, kernel);
+
+   /* TODO: Let SIMD be a parameter. */
+
+   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
+      .KernelStartPointer = kernel_addr.offset,
+      .NumberofThreadsinGPGPUThreadGroup = 1,
+   };
+
+   /* Zero-sized aligned allocation: only moves the batch cursor to a
+    * 256-byte boundary so the start of the commands can be recorded.
+    */
+   void *b = executor_alloc_bytes_aligned(&ec->bo.batch, 0, 256);
+   ec->batch_start = executor_address_of_ptr(&ec->bo.batch, b).offset;
+
+   emit_pipe_control(ec);
+
+#if GFX_VERx10 < 200
+   executor_batch_emit(GENX(PIPELINE_SELECT), ps) {
+      ps.PipelineSelection = GPGPU;
+      ps.MaskBits = 0x3;
+   }
+   emit_pipe_control(ec);
+#endif
+
+   const uint32_t mocs = isl_mocs(ec->isl_dev, 0, false);
+
+   emit_state_base_address(ec, mocs);
+
+#if GFX_VERx10 >= 125
+   executor_batch_emit(GENX(STATE_COMPUTE_MODE), cm) {
+      cm.Mask1 = 0xffff;
+#if GFX_VERx10 >= 200
+      cm.Mask2 = 0xffff;
+#endif
+   }
+
+   executor_batch_emit(GENX(CFE_STATE), cfe) {
+      cfe.MaximumNumberofThreads = 64;
+   }
+#else
+   executor_batch_emit(GENX(MEDIA_VFE_STATE), vfe) {
+      vfe.NumberofURBEntries = 2;
+      vfe.MaximumNumberofThreads = 64;
+   }
+#endif
+
+   emit_pipe_control(ec);
+
+#if GFX_VERx10 >= 125
+   /* Gfx12.5+: the interface descriptor is carried inline by the walker. */
+   executor_batch_emit(GENX(COMPUTE_WALKER), cw) {
+#if GFX_VERx10 >= 200
+      cw.SIMDSize                = 1;
+      cw.MessageSIMD             = 1;
+#endif
+      cw.ThreadGroupIDXDimension = 1;
+      cw.ThreadGroupIDYDimension = 1;
+      cw.ThreadGroupIDZDimension = 1;
+      cw.ExecutionMask           = 0xFFFFFFFF;
+      cw.PostSync.MOCS           = mocs;
+      cw.InterfaceDescriptor     = desc;
+   }
+#else
+   /* Older platforms: the interface descriptor lives in the extra BO and
+    * is loaded by a separate command before the walker.
+    */
+   uint32_t *idd = executor_alloc_bytes_aligned(&ec->bo.extra, 8 * 4, 256);
+   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, idd, &desc);
+
+   executor_address idd_addr = executor_address_of_ptr(&ec->bo.extra, idd);
+
+   executor_batch_emit(GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
+      load.InterfaceDescriptorDataStartAddress = idd_addr.offset;
+      load.InterfaceDescriptorTotalLength = 8 * 4;
+   }
+
+   executor_batch_emit(GENX(GPGPU_WALKER), gw) {
+      gw.ThreadGroupIDXDimension = 1;
+      gw.ThreadGroupIDYDimension = 1;
+      gw.ThreadGroupIDZDimension = 1;
+      gw.RightExecutionMask      = 0xFFFFFFFF;
+      gw.BottomExecutionMask     = 0xFFFFFFFF;
+   }
+
+   executor_batch_emit(GENX(MEDIA_STATE_FLUSH), msf);
+#endif
+
+   emit_pipe_control(ec);
+
+   executor_batch_emit(GENX(MI_BATCH_BUFFER_END), end);
+}
--- a/src/intel/executor/executor_genx.h
+++ b/src/intel/executor/executor_genx.h
@ -0,0 +1,10 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef EXECUTOR_H
+#error This file must be included via executor.h
+#endif
+
+void genX(emit_execute)(executor_context *ec, const executor_params *params);
--- a/src/intel/executor/executor_macros.c
+++ b/src/intel/executor/executor_macros.c
@ -0,0 +1,407 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <ctype.h>
+
+#include "util/ralloc.h"
+#include "intel/compiler/brw_asm.h"
+
+#include "executor.h"
+
+/* Returns true when the string s begins with prefix. */
+static bool
+startswith(const char *prefix, const char *s)
+{
+   const size_t prefix_len = strlen(prefix);
+   return strncmp(prefix, s, prefix_len) == 0;
+}
+
+/* Returns a pointer to the first character of start after prefix.
+ *
+ * start must begin with prefix (asserted).  prefix is const because every
+ * caller passes a string literal.
+ */
+static char *
+skip_prefix(const char *prefix, char *start)
+{
+   assert(startswith(prefix, start));
+   return start + strlen(prefix);
+}
+
+/* Whitespace-separated arguments of a macro line. */
+typedef struct {
+   char **args;
+   int    count;
+} parse_args_result;
+
+/* Split the string c on whitespace.
+ *
+ * Each argument is copied into a new string allocated from mem_ctx; the
+ * input is not modified.  An empty or all-whitespace input yields a count
+ * of zero and a NULL args array.
+ */
+static parse_args_result
+parse_args(void *mem_ctx, char *c)
+{
+   parse_args_result r = {0};
+
+   while (*c) {
+      /* Skip spaces.  The cast keeps isspace() defined for bytes >= 0x80
+       * on platforms where char is signed.
+       */
+      while (*c && isspace((unsigned char)*c))
+         c++;
+      if (!*c)
+         break;
+
+      /* Copy non-spaces. */
+      char *start = c;
+      while (*c && !isspace((unsigned char)*c))
+         c++;
+      r.args = reralloc_array_size(mem_ctx, r.args, sizeof(char *), r.count + 1);
+      r.args[r.count++] = ralloc_strndup(mem_ctx, start, c - start);
+   }
+
+   return r;
+}
+
+/* Expand "@mov REG VALUE" into a mov of an immediate into REG.
+ *
+ * A VALUE containing a '.' is parsed as a float and emitted as its raw bit
+ * pattern with the F type; anything else is lower-cased and emitted
+ * verbatim as a UD immediate.  A SIMD8 mov is generated for verx10 90-125
+ * and a SIMD16 mov for verx10 200.  The expansion is appended to *src.
+ */
+static void
+executor_macro_mov(executor_context *ec, char **src, char *line)
+{
+   char *c = skip_prefix("@mov", line);
+   parse_args_result r = parse_args(ec->mem_ctx, c);
+
+   if (r.count != 2)
+      failf("@mov needs 2 arguments, found %d\n", r.count);
+
+   const char *reg = r.args[0];
+   char *value     = r.args[1];
+
+   if (strchr(value, '.')) {
+      /* Reinterpret the float's bits so they can be printed as hex. */
+      union {
+         float f;
+         uint32_t u;
+      } val;
+
+      val.f = strtof(value, NULL);
+
+      switch (ec->devinfo->verx10) {
+      case 90:
+      case 110:
+      case 120:
+      case 125: {
+         ralloc_asprintf_append(src, "mov(8) %s<1>F 0x%08xF /* %f */ { align1 1Q };\n", reg, val.u, val.f);
+         break;
+      }
+      case 200: {
+         ralloc_asprintf_append(src, "mov(16) %s<1>F 0x%08xF /* %f */ { align1 1H };\n", reg, val.u, val.f);
+         break;
+      }
+      default:
+         unreachable("invalid gfx version");
+      }
+
+   } else {
+      /* Lower-case in place; value is a private copy made by parse_args().
+       * The cast keeps tolower() defined for bytes >= 0x80 on platforms
+       * where char is signed.
+       */
+      for (char *p = value; *p; p++)
+         *p = tolower((unsigned char)*p);
+      switch (ec->devinfo->verx10) {
+      case 90:
+      case 110:
+      case 120:
+      case 125: {
+         ralloc_asprintf_append(src, "mov(8) %s<1>UD %sUD { align1 1Q };\n", reg, value);
+         break;
+      }
+
+      case 200: {
+         ralloc_asprintf_append(src, "mov(16) %s<1>UD %sUD { align1 1H };\n", reg, value);
+         break;
+      }
+
+      default:
+         unreachable("invalid gfx version");
+      }
+   }
+}
+
+/* Expand "@syncnop": append a `sync nop` instruction to *src for verx10
+ * 120, 125 and 200, and nothing for verx10 90 and 110.
+ *
+ * The line argument is not used.  The @read and @write expansions call
+ * this after their send instruction.
+ */
+static void
+executor_macro_syncnop(executor_context *ec, char **src, char *line)
+{
+   switch (ec->devinfo->verx10) {
+   case 90:
+   case 110: {
+      /* Not needed. */
+      break;
+   }
+
+   case 120: {
+      ralloc_strcat(src, "sync nop(8)  null<0,1,0>UD  { align1 WE_all 1H @1 $1 };\n");
+      break;
+   }
+
+   case 125:
+   case 200: {
+      ralloc_strcat(src, "sync nop(8)  null<0,1,0>UD  { align1 WE_all 1H A@1 $1 };\n");
+      break;
+   }
+
+   default:
+      unreachable("invalid gfx version");
+   }
+}
+
+/* Expand "@eot": append to *src a mov of g0 into g127 followed by a send
+ * flagged EOT that uses g127 as its payload.
+ *
+ * The exact send differs per verx10 (thread_spawner for 90-120, gateway
+ * for 125 and 200).  The line argument is not used.  g127 is overwritten.
+ */
+static void
+executor_macro_eot(executor_context *ec, char **src, char *line)
+{
+   switch (ec->devinfo->verx10) {
+   case 90:
+   case 110: {
+      ralloc_strcat(src,
+         "mov(8)          g127<1>UD  g0<8;8,1>UD    { align1 WE_all 1Q };\n"
+         "send(8)         null<1>UW  g127<0,1,0>UD  0x82000010\n"
+         "    thread_spawner MsgDesc: mlen 1 rlen 0 { align1 WE_all 1Q EOT };\n");
+      break;
+   }
+   case 120: {
+      ralloc_strcat(src,
+         "mov(8)          g127<1>UD  g0<8;8,1>UD  { align1 WE_all 1Q };\n"
+         "send(8)         nullUD     g127UD       nullUD  0x02000000  0x00000000\n"
+         "    thread_spawner MsgDesc:  mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q @1 EOT };\n");
+      break;
+   }
+
+   case 125: {
+      ralloc_strcat(src,
+         "mov(8)         g127<1>UD  g0<8;8,1>UD  { align1 WE_all 1Q };\n"
+         "send(8)        nullUD     g127UD       nullUD  0x02000000  0x00000000\n"
+         "    gateway MsgDesc: (open)  mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1Q A@1 EOT };\n");
+         break;
+   }
+
+   case 200: {
+      ralloc_strcat(src,
+         "mov(16)         g127<1>UD  g0<1,1,0>UD  { align1 WE_all 1H };\n"
+         "send(16)        nullUD     g127UD       nullUD  0x02000000  0x00000000\n"
+         "    gateway MsgDesc: (open)  mlen 1 ex_mlen 0 rlen 0 { align1 WE_all 1H I@1 EOT };\n");
+         break;
+   }
+   default:
+      unreachable("invalid gfx version");
+   }
+}
+
+/* Expand "@id REG": append to *src instructions that fill REG with
+ * per-channel values as UD.
+ *
+ * The values come from the vector immediate 0x76543210V, first written to
+ * g127 as UW and then converted into REG.  For verx10 200 a second group
+ * of eight, offset by 8, is built in g127.8 before a SIMD16 conversion.
+ * g127 is overwritten.  Fails unless exactly one argument is given.
+ */
+static void
+executor_macro_id(executor_context *ec, char **src, char *line)
+{
+   char *c = skip_prefix("@id", line);
+   parse_args_result r = parse_args(ec->mem_ctx, c);
+
+   if (r.count != 1)
+      failf("@id needs 1 argument, found %d\n", r.count);
+
+   const char *reg = r.args[0];
+
+   switch (ec->devinfo->verx10) {
+   case 90:
+   case 110:
+   case 120:
+   case 125: {
+      ralloc_asprintf_append(src,
+         "mov(8)  g127<1>UW  0x76543210V    { align1 WE_all 1Q };\n"
+         "mov(8)  %s<1>UD    g127<8,8,1>UW  { align1 WE_all 1Q @1 };\n", reg);
+      break;
+   }
+
+   case 200: {
+      ralloc_asprintf_append(src,
+         "mov(8)  g127<1>UW    0x76543210V         { align1 WE_all 1Q };\n"
+         "add(8)  g127.8<1>UW  g127<1,1,0>UW  8UW  { align1 WE_all 1Q @1 };\n"
+         "mov(16) %s<1>UD      g127<8,8,1>UW       { align1 WE_all 1Q @1 };\n", reg);
+      break;
+   }
+
+   default:
+      unreachable("invalid gfx version");
+   }
+}
+
+/* Expand "@write OFFSET_REG DATA_REG": append to *src instructions that
+ * store DATA_REG into the data BO.
+ *
+ * The per-channel value in OFFSET_REG is multiplied by 4 and added to the
+ * data BO's address to form the address in g127, which is then used by a
+ * store send with DATA_REG as the second payload.  The @syncnop expansion
+ * is appended afterwards.  g127 is overwritten.  Fails unless exactly two
+ * arguments are given.
+ */
+static void
+executor_macro_write(executor_context *ec, char **src, char *line)
+{
+   char *c = skip_prefix("@write", line);
+   parse_args_result r = parse_args(ec->mem_ctx, c);
+
+   if (r.count != 2)
+      failf("@write needs 2 arguments, found %d\n", r.count);
+
+   const char *offset_reg = r.args[0];
+   const char *data_reg   = r.args[1];
+
+   /* The base address is embedded as a 32-bit immediate in the add below. */
+   assert(ec->bo.data.addr <= 0xFFFFFFFF);
+   uint32_t base_addr = ec->bo.data.addr;
+
+   switch (ec->devinfo->verx10) {
+   case 90:
+   case 110:
+   case 120: {
+      /* verx10 below 120 uses the "sends" mnemonic, 120 uses plain "send". */
+      const char *send_suffix = ec->devinfo->verx10 < 120 ? "s" : "";
+      ralloc_asprintf_append(src,
+         "mul(8)          g127<1>UD  %s<8;8,1>UD    0x4UW     { align1 @1 1Q };\n"
+         "add(8)          g127<1>UD  g127<8;8,1>UD  0x%08xUD  { align1 @1 1Q };\n"
+         "send%s(8)       nullUD     g127UD         %sUD      0x2026efd   0x00000040\n"
+         "    dp data 1 MsgDesc: (DC untyped surface write, Surface = 253, "
+         "                        SIMD8, Mask = 0xe) mlen 1 ex_mlen 1 rlen 0 "
+         "    { align1 1Q @1 $1 };\n",
+         offset_reg, base_addr, send_suffix, data_reg);
+      executor_macro_syncnop(ec, src, "@syncnop");
+      break;
+   }
+
+   case 125: {
+      ralloc_asprintf_append(src,
+         "mul(8)          g127<1>UD  %s<1;1,0>UD    0x4UW     { align1 @1 1Q };\n"
+         "add(8)          g127<1>UD  g127<1;1,0>UD  0x%08xUD  { align1 @1 1Q };\n"
+         "send(8)         nullUD     g127UD         %sUD      0x02000504 0x00000040\n"
+         "    ugm MsgDesc: ( store, a32, d32, x, L1STATE_L3MOCS dst_len = 0, "
+         "                   src0_len = 1, src1_len = 1, flat )  base_offset 0 "
+         "    { align1 1Q A@1 $1 };\n",
+         offset_reg, base_addr, data_reg);
+      executor_macro_syncnop(ec, src, "@syncnop");
+      break;
+   }
+
+   case 200: {
+      ralloc_asprintf_append(src,
+         "mul(16)          g127<1>UD  %s<1;1,0>UD    0x4UW     { align1 @1 1Q };\n"
+         "add(16)          g127<1>UD  g127<1;1,0>UD  0x%08xUD  { align1 @1 1Q };\n"
+         "send(16)         nullUD     g127UD         %sUD      0x02000504 0x00000040\n"
+         "    ugm MsgDesc: ( store, a32, d32, x, L1STATE_L3MOCS dst_len = 0, "
+         "                   src0_len = 1, src1_len = 1, flat ) base_offset 0  "
+         "    { align1 1Q A@1 $1 };\n",
+         offset_reg, base_addr, data_reg);
+      executor_macro_syncnop(ec, src, "@syncnop");
+      break;
+   }
+
+   default:
+      unreachable("invalid gfx version");
+   }
+}
+
+/* Expand "@read DATA_REG OFFSET_REG": append to *src instructions that
+ * load from the data BO into DATA_REG.
+ *
+ * The per-channel value in OFFSET_REG is multiplied by 4 and added to the
+ * data BO's address to form the address in g127, which is then used by a
+ * load send whose destination is DATA_REG.  The @syncnop expansion is
+ * appended afterwards.  g127 is overwritten.  Fails unless exactly two
+ * arguments are given.
+ *
+ * NOTE(review): the verx10 200 case is identical to the 125 case and uses
+ * SIMD8 instructions, whereas @write and @mov use SIMD16 for verx10 200.
+ * Confirm whether this is intentional.
+ */
+static void
+executor_macro_read(executor_context *ec, char **src, char *line)
+{
+   char *c = skip_prefix("@read", line);
+   parse_args_result r = parse_args(ec->mem_ctx, c);
+
+   if (r.count != 2)
+      failf("@read needs 2 arguments, found %d\n", r.count);
+
+   /* Order follows underlying SEND, destination first. */
+   const char *data_reg   = r.args[0];
+   const char *offset_reg = r.args[1];
+
+   /* The base address is embedded as a 32-bit immediate in the add below. */
+   assert(ec->bo.data.addr <= 0xFFFFFFFF);
+   uint32_t base_addr = ec->bo.data.addr;
+
+   switch (ec->devinfo->verx10) {
+   case 90:
+   case 110:
+   case 120: {
+      /* verx10 below 120 uses the "sends" mnemonic, 120 uses plain "send". */
+      const char *send_suffix = ec->devinfo->verx10 < 120 ? "s" : "";
+      ralloc_asprintf_append(src,
+         "mul(8)          g127<1>UD  %s<8;8,1>UD    0x4UW     { align1 @1 1Q };\n"
+         "add(8)          g127<1>UD  g127<8;8,1>UD  0x%08xUD  { align1 @1 1Q };\n"
+         "send%s(8)       %sUD       g127UD         nullUD    0x2106efd   0x00000000\n"
+         "    dp data 1 MsgDesc: (DC untyped surface read, Surface = 253, "
+         "                        SIMD8, Mask = 0xe) mlen 1 ex_mlen 0 rlen 1 "
+         "    { align1 1Q @1 $1 };\n",
+         offset_reg, base_addr, send_suffix, data_reg);
+      executor_macro_syncnop(ec, src, "@syncnop");
+      break;
+   }
+
+   case 125: {
+      ralloc_asprintf_append(src,
+         "mul(8)          g127<1>UD  %s<1;1,0>UD    0x4UW     { align1 @1 1Q };\n"
+         "add(8)          g127<1>UD  g127<1;1,0>UD  0x%08xUD  { align1 @1 1Q };\n"
+         "send(8)         %sUD       g127UD         nullUD    0x02100500 0x00000000\n"
+         "    ugm MsgDesc: ( load, a32, d32, x, L1STATE_L3MOCS dst_len = 1, "
+         "                   src0_len = 1, flat ) src1_len = 0  base_offset 0 "
+         "    { align1 1Q A@1 $1 };\n",
+         offset_reg, base_addr, data_reg);
+      executor_macro_syncnop(ec, src, "@syncnop");
+      break;
+   }
+
+   case 200: {
+      ralloc_asprintf_append(src,
+         "mul(8)          g127<1>UD  %s<1;1,0>UD    0x4UW     { align1 @1 1Q };\n"
+         "add(8)          g127<1>UD  g127<1;1,0>UD  0x%08xUD  { align1 @1 1Q };\n"
+         "send(8)         %sUD       g127UD         nullUD    0x02100500 0x00000000\n"
+         "    ugm MsgDesc: ( load, a32, d32, x, L1STATE_L3MOCS dst_len = 1, "
+         "                   src0_len = 1, flat ) src1_len = 0  base_offset 0 "
+         "    { align1 1Q A@1 $1 };\n",
+         offset_reg, base_addr, data_reg);
+      executor_macro_syncnop(ec, src, "@syncnop");
+      break;
+   }
+
+   default:
+      unreachable("invalid gfx version");
+   }
+}
+
+/* Returns a pointer to the '@' that starts a macro when it is the first
+ * non-whitespace character of line, or NULL otherwise.
+ */
+static char *
+find_macro_symbol(char *line)
+{
+   char *c = line;
+   /* The cast keeps isspace() defined for bytes >= 0x80 on platforms where
+    * char is signed.
+    */
+   while (isspace((unsigned char)*c))
+      c++;
+   return *c == '@' ? c : NULL;
+}
+
+/* Returns true when line starts with the macro name and the name is
+ * followed by whitespace or the end of the string, so that "@id" does not
+ * match a longer word that merely begins with "@id".
+ */
+static bool
+match_macro_name(const char *name, const char *line)
+{
+   if (!startswith(name, line))
+      return false;
+   line += strlen(name);
+   /* The cast keeps isspace() defined for bytes >= 0x80 on platforms where
+    * char is signed.
+    */
+   return !*line || isspace((unsigned char)*line);
+}
+
+/* Expand the @-macros in original_src and return the resulting assembly.
+ *
+ * The source is processed line by line: a line whose first non-whitespace
+ * character is '@' is replaced by the expansion of the matching macro,
+ * and any other line is copied through followed by a newline.  An unknown
+ * macro is reported through failf().  The returned string is allocated
+ * from ec->mem_ctx; original_src is not modified.
+ */
+const char *
+executor_apply_macros(executor_context *ec, const char *original_src)
+{
+   /* Work on a private copy since lines are NUL-terminated in place. */
+   char *scratch = ralloc_strdup(ec->mem_ctx, original_src);
+
+   /* Create a ralloc'ed empty string so can call append to it later. */
+   char *src = ralloc_strdup(ec->mem_ctx, "");
+
+   /* TODO: Create a @send macro for common combinations of MsgDesc. */
+   static const struct {
+      const char *name;
+      void (*func)(executor_context *ec, char **output, char *line);
+   } macros[] = {
+      { "@eot",      executor_macro_eot },
+      { "@mov",      executor_macro_mov },
+      { "@write",    executor_macro_write },
+      { "@read",     executor_macro_read },
+      { "@id",       executor_macro_id },
+      { "@syncnop",  executor_macro_syncnop },
+   };
+
+   char *next = scratch;
+   while (next) {
+      char *line = next;
+      char *end = line;
+
+      /* Cut the current line out of the scratch copy. */
+      while (*end && *end != '\n') end++;
+      next = *end ? end + 1 : NULL;
+      *end = '\0';
+
+      char *macro = find_macro_symbol(line);
+      if (!macro) {
+         ralloc_asprintf_append(&src, "%s\n", line);
+      } else {
+         bool found = false;
+         for (unsigned i = 0; i < ARRAY_SIZE(macros); i++) {
+            if (match_macro_name(macros[i].name, macro)) {
+               macros[i].func(ec, &src, macro);
+               found = true;
+               break;
+            }
+         }
+         /* End the message with a newline like every other failf() call. */
+         if (!found)
+            failf("unsupported macro line: %s\n", macro);
+      }
+   }
+
+   return src;
+}
--- a/src/intel/executor/executor_main.c
+++ b/src/intel/executor/executor_main.c
@ -0,0 +1,850 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include <lua.h>
+#include <lualib.h>
+#include <lauxlib.h>
+
+#include "util/ralloc.h"
+
+#include <xf86drm.h>
+#include "drm-uapi/i915_drm.h"
+#include "drm-uapi/xe_drm.h"
+
+#include "intel/compiler/brw_asm.h"
+#include "intel/compiler/brw_isa_info.h"
+#include "intel/common/intel_gem.h"
+#include "intel/common/xe/intel_engine.h"
+#include "intel/decoder/intel_decoder.h"
+#include "intel/dev/intel_debug.h"
+
+#include "executor.h"
+
+enum {
+   /* Predictable base addresses here make it easier to spot errors. */
+   EXECUTOR_BO_BATCH_ADDR = 0x10000000,
+   EXECUTOR_BO_EXTRA_ADDR = 0x20000000,
+   EXECUTOR_BO_DATA_ADDR  = 0x30000000,
+
+   /* Apply to all BOs. */
+   EXECUTOR_BO_SIZE = 10 * 1024 * 1024,
+};
+
+/* Print the usage text to stdout.
+ *
+ * The two printf arguments fill in the GPU address the data buffer is bound
+ * to and the Lua release the program was built against.
+ */
+static void
+print_help(void)
+{
+   printf(
+      "Executes shaders written for Intel GPUs\n"
+      "usage: executor FILENAME\n"
+      "\n"
+      "The input is a Lua script that can perform data manipulation\n"
+      "and dispatch execution of compute shaders, written in Xe assembly,\n"
+      "the same format used by the brw_asm assembler or when dumping\n"
+      "shaders in debug mode.\n"
+      "\n"
+      "The goal is to have a tool to experiment directly with certain\n"
+      "assembly instructions and the shared units without having to\n"
+      "instrument the drivers.\n"
+      "\n"
+      "EXECUTION CONTEXT\n"
+      "\n"
+      "By default compute shaders are used with SIMD8 for Gfx9-125 and SIMD16\n"
+      "for Xe2.  Only a single thread is dispatched.  A data buffer is used to\n"
+      "pipe data into the shader and out of it, it is bound to the graphics\n"
+      "address 0x%08x.\n"
+      "\n"
+      "The Gfx versions have differences in their assembly and shared units, so\n"
+      "other than very simple examples, scripts for this program will be either\n"
+      "specific to a version or provide shader variants for multiple versions.\n"
+      "\n"
+      "ASSEMBLY MACROS\n"
+      "\n"
+      "In addition to regular instructions, the following macros will generate\n"
+      "assembly code based on the Gfx version being executed.  Unlike in regular\n"
+      "instructions, REGs don't use regions and can't be immediates.\n"
+      "\n"
+      "- @eot\n"
+      "  Send an EOT message.\n"
+      "\n"
+      "- @mov REG IMM\n"
+      "  Like a regular MOV but accepts numbers in both decimal and\n"
+      "  floating-point.\n"
+      "\n"
+      "- @id REG\n"
+      "  Write a local invocation index into REG.\n"
+      "\n"
+      "- @read DST_REG OFFSET_REG\n"
+      "  Read 32-bit values from the memory buffer at OFFSET_REG into DST_REG.\n"
+      "\n"
+      "- @write OFFSET_REG SRC_REG\n"
+      "  Write 32-bit values from SRC_REG to the memory buffer at OFFSET_REG.\n"
+      "\n"
+      "- @syncnop\n"
+      "  Produce a coarse grained sync.nop (when applicable) to ensure data from\n"
+      "  macros above are read/written.\n"
+      "\n"
+      "LUA ENVIRONMENT\n"
+      "\n"
+      "In addition to the regular Lua standard library the following variables and\n"
+      "functions are available.\n"
+      "\n"
+      "- execute({src=STR, data=ARRAY}) -> ARRAY\n"
+      "  Takes a table as argument.  The 'src' in the table contains the shader to be\n"
+      "  executed.  The 'data' argument will be used to fill the data buffer with 32-bit\n"
+      "  values.  The function returns an ARRAY with the contents of the data buffer\n"
+      "  after the shader completes.\n"
+      "\n"
+      "- dump(ARRAY, COUNT)\n"
+      "  Pretty print the COUNT first elements of an array of 32-bit values.\n"
+      "\n"
+      "- check_ver(V, ...), check_verx10(V, ...)\n"
+      "  Exit if the Gfx version being executed isn't in the arguments list.\n"
+      "\n"
+      "- ver, verx10\n"
+      "  Variables containing the Gfx version being executed.\n"
+      "\n"
+      "This program was compiled with %s.\n"
+      "\n"
+      "ENVIRONMENT VARIABLES\n"
+      "\n"
+      "The following INTEL_DEBUG values (comma separated) are used:\n"
+      "\n"
+      " - bat             Dumps the batch buffer.\n"
+      " - color           Uses colors for the above.\n"
+      " - cs              Dumps the assembly after macro processing.\n"
+      "\n"
+      "EXAMPLE\n"
+      "\n"
+      "The following script\n"
+      "\n"
+      "  local r = execute {\n"
+      "    data={ [42] = 0x100 },\n"
+      "    src=[[\n"
+      "      @mov     g1      42\n"
+      "      @read    g2      g1\n"
+      "\n"
+      "      @id      g3\n"
+      "\n"
+      "      add(8)   g4<1>UD  g2<8,8,1>UD  g3<8,8,1>UD  { align1 @1 1Q };\n"
+      "\n"
+      "      @write   g3       g4\n"
+      "      @eot\n"
+      "    ]]\n"
+      "  }\n"
+      "\n"
+      "  dump(r, 4)\n"
+      "\n"
+      "Will produce the following output\n"
+      "\n"
+      "   [0x00000000] 0x00000100 0x00000101 0x00000102 0x00000103\n"
+      "\n"
+      "More examples can be found in the examples/ directory in the source code.\n"
+      "\n", EXECUTOR_BO_DATA_ADDR, LUA_RELEASE);
+}
+
+/* Process-wide state describing the GPU in use.  main() fills it in once
+ * (device fd and info, ISL device, ISA info) before the script runs, and the
+ * Lua-facing functions read from it afterwards.
+ */
+static struct {
+   struct intel_device_info devinfo;
+   struct isl_device isl_dev;
+   struct brw_isa_info isa;
+   int fd;
+} E;
+
+/* Call the per-generation variant of func (gfx9_func, gfx11_func, ...)
+ * selected by the verx10 of the device stored in E.  Any verx10 without a
+ * case below ends up in unreachable().
+ */
+#define genX_call(func, ...)                                \
+   switch (E.devinfo.verx10) {                              \
+   case 90:  gfx9_  ##func(__VA_ARGS__); break;             \
+   case 110: gfx11_ ##func(__VA_ARGS__); break;             \
+   case 120: gfx12_ ##func(__VA_ARGS__); break;             \
+   case 125: gfx125_##func(__VA_ARGS__); break;             \
+   case 200: gfx20_ ##func(__VA_ARGS__); break;             \
+   default: unreachable("Unsupported hardware generation"); \
+   }
+
+/* Create a GEM buffer object of size_in_bytes and map it into the CPU
+ * address space.
+ *
+ * The kernel-specific part (i915 or Xe) creates the object and queries the
+ * fake offset to mmap it with; the mapping itself is shared by both paths.
+ * On return bo holds the handle, the CPU mapping, the size, the GPU address
+ * addr the BO is meant to be bound at, and a cursor pointing at the start of
+ * the mapping.  Any failure is fatal (failf).
+ */
+static void
+executor_create_bo(executor_context *ec, executor_bo *bo, uint64_t addr, uint32_t size_in_bytes)
+{
+   uint32_t handle;
+   uint64_t mmap_offset;
+
+   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
+      struct drm_i915_gem_create gem_create = {
+         .size = size_in_bytes,
+      };
+
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
+      if (err)
+         failf("i915_gem_create");
+
+      struct drm_i915_gem_mmap_offset mm = {
+         .handle = gem_create.handle,
+         .flags  = ec->devinfo->has_local_mem ? I915_MMAP_OFFSET_FIXED
+                                              : I915_MMAP_OFFSET_WC,
+      };
+
+      err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mm);
+      if (err)
+         failf("i915_gem_mmap_offset");
+
+      handle      = gem_create.handle;
+      mmap_offset = mm.offset;
+   } else {
+      assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);
+
+      struct drm_xe_gem_create gem_create = {
+         .size        = size_in_bytes,
+         .cpu_caching = DRM_XE_GEM_CPU_CACHING_WB,
+         .placement   = 1u << ec->devinfo->mem.sram.mem.instance,
+      };
+
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_GEM_CREATE, &gem_create);
+      if (err)
+         failf("xe_gem_create");
+
+      struct drm_xe_gem_mmap_offset mm = {
+         .handle = gem_create.handle,
+      };
+
+      err = intel_ioctl(ec->fd, DRM_IOCTL_XE_GEM_MMAP_OFFSET, &mm);
+      if (err)
+         failf("xe_gem_mmap_offset");
+
+      handle      = gem_create.handle;
+      mmap_offset = mm.offset;
+   }
+
+   /* mmap() reports failure with MAP_FAILED, not with a NULL pointer. */
+   void *map = mmap(NULL, size_in_bytes, PROT_READ | PROT_WRITE,
+                    MAP_SHARED, ec->fd, mmap_offset);
+   if (map == MAP_FAILED)
+      failf("mmap");
+
+   bo->handle = handle;
+   bo->map    = map;
+   bo->size   = size_in_bytes;
+   bo->addr   = addr;
+   bo->cursor = bo->map;
+}
+
+/* Undo executor_create_bo(): drop the CPU mapping, close the GEM handle and
+ * clear the bookkeeping in bo.  Any failure is fatal.
+ */
+static void
+executor_destroy_bo(executor_context *ec, executor_bo *bo)
+{
+   if (munmap(bo->map, bo->size) != 0)
+      failf("munmap");
+
+   struct drm_gem_close close_args = {
+      .handle = bo->handle,
+   };
+   if (intel_ioctl(ec->fd, DRM_IOCTL_GEM_CLOSE, &close_args) != 0)
+      failf("gem_close");
+
+   /* Leave no stale handle or pointer behind. */
+   memset(bo, 0, sizeof(*bo));
+}
+
+/* Hex dump the used part of bo (from the start of the mapping up to the
+ * cursor) as 32-bit words, eight per row, each row prefixed with the GPU
+ * address of its first word.
+ */
+static void
+executor_print_bo(executor_bo *bo, const char *name)
+{
+   /* Only whole dwords can be printed. */
+   assert((bo->cursor - bo->map) % 4 == 0);
+
+   uint32_t *words = bo->map;
+   const uint32_t count = (uint32_t *)bo->cursor - words;
+
+   printf("=== %s (0x%08lx, %lu bytes) ===\n", name, bo->addr, bo->cursor - bo->map);
+
+   for (uint32_t i = 0; i < count; i++) {
+      const uint32_t column = i % 8;
+
+      if (column == 0)
+         printf("[0x%08x] ", (i * 4) + (uint32_t)bo->addr);
+
+      printf("0x%08x ", words[i]);
+
+      if (column == 7)
+         printf("\n");
+   }
+   printf("\n");
+}
+
+/* Reserve size bytes at the current cursor of bo, with no alignment
+ * requirement, and return a pointer to them.
+ */
+void *
+executor_alloc_bytes(executor_bo *bo, uint32_t size)
+{
+   const uint32_t no_alignment = 0;
+   return executor_alloc_bytes_aligned(bo, size, no_alignment);
+}
+
+/* Reserve size bytes from bo, starting at the cursor rounded up to alignment.
+ *
+ * alignment is either 0 (no rounding) or a power of two.  The cursor is moved
+ * past the reserved range and a pointer to its start is returned.  A request
+ * that does not fit in the remaining space of the BO is fatal instead of
+ * silently handing out memory beyond the mapping.
+ */
+void *
+executor_alloc_bytes_aligned(executor_bo *bo, uint32_t size, uint32_t alignment)
+{
+   /* The mask arithmetic below is only valid for powers of two. */
+   assert(alignment == 0 || (alignment & (alignment - 1)) == 0);
+
+   uintptr_t start = (uintptr_t)bo->cursor;
+   if (alignment)
+      start = (start + alignment - 1) & ~((uintptr_t)alignment - 1);
+
+   const uintptr_t limit = (uintptr_t)bo->map + bo->size;
+   if (start > limit || size > limit - start)
+      failf("buffer object out of space (requested %u bytes)", size);
+
+   bo->cursor = (void *)(start + size);
+   return (void *)start;
+}
+
+/* Translate a CPU pointer into the mapping of bo to the matching GPU address:
+ * the offset of ptr within the mapping, added to the address the BO has.
+ */
+executor_address
+executor_address_of_ptr(executor_bo *bo, void *ptr)
+{
+   const uint64_t offset_in_bo = ptr - bo->map;
+   return (executor_address){offset_in_bo + bo->addr};
+}
+
+/* Open the first usable Intel GPU.
+ *
+ * Walks the DRM devices looking for a PCI device from Intel (vendor 0x8086)
+ * with a render node whose device info can be queried and whose generation
+ * has a backend in this tool.  On success devinfo is filled in and the open
+ * render node fd is returned; otherwise -1 is returned.
+ */
+static int
+get_drm_device(struct intel_device_info *devinfo)
+{
+   drmDevicePtr devices[8];
+   const int num_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
+   if (num_devices < 0)
+      return -1;
+
+   int fd = -1;
+   for (int i = 0; i < num_devices; i++) {
+      if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
+          devices[i]->bustype == DRM_BUS_PCI &&
+          devices[i]->deviceinfo.pci->vendor_id == 0x8086) {
+         fd = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR | O_CLOEXEC);
+         if (fd < 0)
+            continue;
+
+         /* Gfx9 is the oldest generation genX_call() can dispatch to, so
+          * anything older is skipped here rather than failing later.
+          */
+         if (!intel_get_device_info_from_fd(fd, devinfo, -1, -1) ||
+             devinfo->ver < 9) {
+            close(fd);
+            fd = -1;
+            continue;
+         }
+
+         /* Found a device! */
+         break;
+      }
+   }
+
+   /* The device list is allocated by libdrm and is ours to release. */
+   drmFreeDevices(devices, num_devices);
+
+   return fd;
+}
+
+/* Batch decoder callback: find which of the executor BOs (batch, extra or
+ * data, checked in that order) contains address and describe it.  A zeroed
+ * descriptor is returned when the address is in none of them.
+ */
+static struct intel_batch_decode_bo
+decode_get_bo(void *_ec, bool ppgtt, uint64_t address)
+{
+   executor_context *ec = _ec;
+   const executor_bo *candidates[] = {
+      &ec->bo.batch,
+      &ec->bo.extra,
+      &ec->bo.data,
+   };
+
+   for (int i = 0; i < ARRAY_SIZE(candidates); i++) {
+      const executor_bo *c = candidates[i];
+      if (address < c->addr || address >= c->addr + c->size)
+         continue;
+
+      return (struct intel_batch_decode_bo) {
+         .addr = c->addr,
+         .size = c->size,
+         .map  = c->map,
+      };
+   }
+
+   return (struct intel_batch_decode_bo) {0};
+}
+
+/* Batch decoder callback: the size of individual pieces of state isn't
+ * tracked, so the size shared by all BOs is reported for every address.
+ */
+static unsigned
+decode_get_state_size(void *_ec, uint64_t address, uint64_t base_address)
+{
+   const unsigned whole_bo = EXECUTOR_BO_SIZE;
+   return whole_bo;
+}
+
+/* Copy the 'data' table of an execute() call into the data BO.
+ *
+ * Every key must be an integer dword index inside the data buffer; the
+ * matching value is converted with lua_tointeger() and stored as a 32-bit
+ * word at that index.  Invalid input from the script is reported with failf()
+ * so the checks are also effective in builds without assertions.
+ *
+ * table_idx is the stack index of the table; the stack is left as found.
+ */
+static void
+parse_execute_data(executor_context *ec, lua_State *L, int table_idx)
+{
+   if (!lua_istable(L, table_idx))
+      failf("'data' parameter for execute() must be a table");
+
+   uint32_t *data = ec->bo.data.map;
+   const lua_Integer num_dwords = ec->bo.data.size / sizeof(uint32_t);
+
+   /* Work on a copy at the top so relative indices below are stable. */
+   lua_pushvalue(L, table_idx);
+
+   lua_pushnil(L);
+   while (lua_next(L, -2) != 0) {
+      /* Stack: ..., table, key, value. */
+      if (!lua_isinteger(L, -2))
+         failf("invalid key for data in execute call");
+
+      const lua_Integer key = lua_tointeger(L, -2);
+      if (key < 0 || key >= num_dwords) {
+         failf("data index %lld in execute call is outside of the data "
+               "buffer (%lld dwords)", (long long)key, (long long)num_dwords);
+      }
+
+      data[key] = (uint32_t)lua_tointeger(L, -1);
+
+      /* Drop the value, keep the key for the next iteration. */
+      lua_pop(L, 1);
+   }
+
+   lua_pop(L, 1);
+}
+
+/* Parse the table passed to execute(), expected at the top of the Lua stack.
+ *
+ * Recognized string keys:
+ *   - "src":  shader source, duplicated on ec->mem_ctx into
+ *             params->original_src.
+ *   - "data": table used to fill the data BO.
+ * Any other string key is a fatal error; entries with non-string keys are
+ * ignored.
+ */
+static void
+parse_execute_args(executor_context *ec, lua_State *L, executor_params *params)
+{
+   int opts = lua_gettop(L);
+
+   /* lua_next() may only be used on tables, reject anything else upfront. */
+   if (!lua_istable(L, opts))
+      failf("execute() must have a single table argument");
+
+   lua_pushnil(L);
+
+   while (lua_next(L, opts) != 0) {
+      int val_idx = lua_gettop(L);
+      int key_idx = val_idx - 1;
+
+      if (lua_type(L, key_idx) != LUA_TSTRING) {
+         lua_pop(L, 1);
+         continue;
+      }
+
+      const char *key = lua_tostring(L, key_idx);
+
+      if (!strcmp(key, "src")) {
+         params->original_src = ralloc_strdup(ec->mem_ctx, luaL_checkstring(L, val_idx));
+      } else if (!strcmp(key, "data")) {
+         parse_execute_data(ec, L, val_idx);
+      } else {
+         failf("unknown parameter '%s' for execute()", key);
+      }
+
+      lua_pop(L, 1);
+   }
+}
+
+/* Create the kernel objects needed for one execution.
+ *
+ * On i915 that is a GEM context.  On Xe it is a VM (with scratch page) and an
+ * exec queue on the first render engine reported by the kernel.  After that
+ * the batch, extra and data BOs are created, and the data BO is filled with
+ * the 0xABABABAB pattern so untouched dwords are easy to recognize.
+ */
+static void
+executor_context_setup(executor_context *ec)
+{
+   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
+      struct drm_i915_gem_context_create create = {0};
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
+      if (err)
+         failf("i915_gem_context_create");
+      ec->i915.ctx_id = create.ctx_id;
+   } else {
+      assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);
+
+      struct drm_xe_vm_create create = {
+         .flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
+      };
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_CREATE, &create);
+      if (err)
+         failf("xe_vm_create");
+      ec->xe.vm_id = create.vm_id;
+
+      struct drm_xe_engine_class_instance instance = {0};
+
+      struct intel_query_engine_info *engines_info = xe_engine_get_info(ec->fd);
+      if (!engines_info)
+         failf("xe_engine_get_info");
+
+      bool found_engine = false;
+      for (int i = 0; i < engines_info->num_engines; i++) {
+         struct intel_engine_class_instance *e = &engines_info->engines[i];
+         if (e->engine_class == INTEL_ENGINE_CLASS_RENDER) {
+            instance.engine_class = DRM_XE_ENGINE_CLASS_RENDER;
+            instance.engine_instance = e->engine_instance;
+            instance.gt_id = e->gt_id;
+            found_engine = true;
+            break;
+         }
+      }
+
+      /* Everything needed was copied into instance; the query result is
+       * heap allocated and owned by the caller.
+       */
+      free(engines_info);
+
+      if (!found_engine)
+         failf("no render engine available");
+
+      struct drm_xe_exec_queue_create queue_create = {
+         .vm_id          = ec->xe.vm_id,
+         .width          = 1,
+         .num_placements = 1,
+         .instances      = (uintptr_t)&instance,
+      };
+      err = intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &queue_create);
+      if (err)
+         failf("xe_exec_queue_create");
+      ec->xe.queue_id = queue_create.exec_queue_id;
+   }
+
+   executor_create_bo(ec, &ec->bo.batch, EXECUTOR_BO_BATCH_ADDR, EXECUTOR_BO_SIZE);
+   executor_create_bo(ec, &ec->bo.extra, EXECUTOR_BO_EXTRA_ADDR, EXECUTOR_BO_SIZE);
+   executor_create_bo(ec, &ec->bo.data,  EXECUTOR_BO_DATA_ADDR, EXECUTOR_BO_SIZE);
+
+   uint32_t *data = ec->bo.data.map;
+   for (int i = 0; i < EXECUTOR_BO_SIZE / 4; i++)
+      data[i] = 0xABABABAB;
+}
+
+/* Submit the batch buffer and block until the GPU is done with it.
+ *
+ * i915: the three BOs are passed pinned at their fixed addresses to
+ * execbuffer2, then the batch BO is waited on.
+ *
+ * Xe: the three BOs are bound into the VM, the batch is executed once the
+ * bind has signalled, and the execution is waited on through a syncobj.  The
+ * syncobjs only live for the duration of this call.
+ *
+ * Any ioctl failure is fatal.
+ */
+static void
+executor_context_dispatch(executor_context *ec)
+{
+   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
+      struct drm_i915_gem_exec_object2 objs[] = {
+         {
+            .handle = ec->bo.batch.handle,
+            .offset = ec->bo.batch.addr,
+            .flags  = EXEC_OBJECT_PINNED,
+         },
+         {
+            .handle = ec->bo.extra.handle,
+            .offset = ec->bo.extra.addr,
+            .flags  = EXEC_OBJECT_PINNED,
+         },
+         {
+            .handle = ec->bo.data.handle,
+            .offset = ec->bo.data.addr,
+            .flags  = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE,
+         },
+      };
+
+      struct drm_i915_gem_execbuffer2 exec = {0};
+      exec.buffers_ptr = (uintptr_t)objs;
+      exec.buffer_count = ARRAY_SIZE(objs);
+      exec.batch_start_offset = ec->batch_start - ec->bo.batch.addr;
+      exec.flags = I915_EXEC_BATCH_FIRST;
+      exec.rsvd1 = ec->i915.ctx_id;
+
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &exec);
+      if (err)
+          failf("i915_gem_execbuffer2");
+
+      struct drm_i915_gem_wait wait = {0};
+      wait.bo_handle = ec->bo.batch.handle;
+      wait.timeout_ns = INT64_MAX;
+
+      err = intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+      if (err)
+         failf("i915_gem_wait");
+   } else {
+      assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);
+
+      /* First syncobj is signalled by the binding operation and waited by the
+       * execution of the batch buffer.
+       *
+       * Second syncobj is signalled by the execution of batch buffer and
+       * waited at the end.
+       */
+      uint32_t sync_handles[2] = {0};
+      for (int i = 0; i < 2; i++) {
+         struct drm_syncobj_create sync_create = {0};
+         int err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_CREATE, &sync_create);
+         if (err)
+            failf("syncobj_create");
+         sync_handles[i] = sync_create.handle;
+      }
+
+      struct drm_xe_vm_bind_op bind_ops[] = {
+         {
+            .op        = DRM_XE_VM_BIND_OP_MAP,
+            .obj       = ec->bo.batch.handle,
+            .addr      = ec->bo.batch.addr,
+            .range     = EXECUTOR_BO_SIZE,
+            .pat_index = ec->devinfo->pat.cached_coherent.index,
+         },
+         {
+            .op        = DRM_XE_VM_BIND_OP_MAP,
+            .obj       = ec->bo.extra.handle,
+            .addr      = ec->bo.extra.addr,
+            .range     = EXECUTOR_BO_SIZE,
+            .pat_index = ec->devinfo->pat.cached_coherent.index,
+         },
+         {
+            .op        = DRM_XE_VM_BIND_OP_MAP,
+            .obj       = ec->bo.data.handle,
+            .addr      = ec->bo.data.addr,
+            .range     = EXECUTOR_BO_SIZE,
+            .pat_index = ec->devinfo->pat.cached_coherent.index,
+         },
+      };
+
+      struct drm_xe_sync bind_syncs[] = {
+         {
+            .type   = DRM_XE_SYNC_TYPE_SYNCOBJ,
+            .handle = sync_handles[0],
+            .flags  = DRM_XE_SYNC_FLAG_SIGNAL,
+         },
+      };
+
+      struct drm_xe_vm_bind bind = {
+         .vm_id           = ec->xe.vm_id,
+         .num_binds       = ARRAY_SIZE(bind_ops),
+         .vector_of_binds = (uintptr_t)bind_ops,
+         .num_syncs       = 1,
+         .syncs           = (uintptr_t)bind_syncs,
+      };
+
+      int err = intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_BIND, &bind);
+      if (err)
+         failf("xe_vm_bind");
+
+      struct drm_xe_sync exec_syncs[] = {
+         {
+            .type   = DRM_XE_SYNC_TYPE_SYNCOBJ,
+            .handle = sync_handles[0],
+         },
+         {
+            .type   = DRM_XE_SYNC_TYPE_SYNCOBJ,
+            .handle = sync_handles[1],
+            .flags  = DRM_XE_SYNC_FLAG_SIGNAL,
+         }
+      };
+
+      struct drm_xe_exec exec = {
+         .exec_queue_id    = ec->xe.queue_id,
+         .num_batch_buffer = 1,
+         .address          = ec->batch_start,
+         .num_syncs        = 2,
+         .syncs            = (uintptr_t)exec_syncs,
+      };
+      err = intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC, &exec);
+      if (err)
+         failf("xe_exec");
+
+      struct drm_syncobj_wait wait = {
+         .count_handles = 1,
+         .handles       = (uintptr_t)&sync_handles[1],
+         .timeout_nsec  = INT64_MAX,
+      };
+      err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait);
+      if (err)
+         failf("syncobj_wait");
+
+      /* The execution has completed, so neither syncobj is needed anymore.
+       * Release them here: nothing else keeps track of their handles, and a
+       * script may call execute() many times.
+       */
+      for (int i = 0; i < 2; i++) {
+         struct drm_syncobj_destroy sync_destroy = {
+            .handle = sync_handles[i],
+         };
+         err = intel_ioctl(ec->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &sync_destroy);
+         if (err)
+            failf("syncobj_destroy");
+      }
+   }
+}
+
+/* Release what executor_context_setup() created: first the three BOs, then
+ * the i915 context or the Xe exec queue and VM.  Any failure is fatal.
+ */
+static void
+executor_context_teardown(executor_context *ec)
+{
+   executor_bo *const bos[] = {
+      &ec->bo.batch,
+      &ec->bo.extra,
+      &ec->bo.data,
+   };
+   for (int i = 0; i < ARRAY_SIZE(bos); i++)
+      executor_destroy_bo(ec, bos[i]);
+
+   if (ec->devinfo->kmd_type != INTEL_KMD_TYPE_I915) {
+      assert(ec->devinfo->kmd_type == INTEL_KMD_TYPE_XE);
+
+      /* The queue goes first, then the VM it was created on. */
+      struct drm_xe_exec_queue_destroy queue_destroy = {
+         .exec_queue_id = ec->xe.queue_id,
+      };
+      if (intel_ioctl(ec->fd, DRM_IOCTL_XE_EXEC_QUEUE_DESTROY, &queue_destroy))
+         failf("xe_exec_queue_destroy");
+
+      struct drm_xe_vm_destroy vm_destroy = {
+         .vm_id =  ec->xe.vm_id,
+      };
+      if (intel_ioctl(ec->fd, DRM_IOCTL_XE_VM_DESTROY, &vm_destroy))
+         failf("xe_vm_destroy");
+
+      return;
+   }
+
+   struct drm_i915_gem_context_destroy ctx_destroy = {
+      .ctx_id = ec->i915.ctx_id,
+   };
+   if (intel_ioctl(ec->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &ctx_destroy))
+      failf("i915_gem_context_destroy");
+}
+
+/* Lua binding for execute({src=STR, data=ARRAY}) -> ARRAY.
+ *
+ * Sets up a fresh execution context, fills the data BO from 'data', expands
+ * the macros in 'src' and assembles it, emits the batch for the current
+ * generation, optionally decodes it (INTEL_DEBUG=bat), dispatches it and
+ * waits for completion.
+ *
+ * Returns one value to Lua: a table indexed from 0 holding every dword of the
+ * data buffer as found after the shader ran.
+ */
+static int
+l_execute(lua_State *L)
+{
+   if (lua_gettop(L) != 1)
+      failf("execute() must have a single table argument");
+
+   executor_context ec = {
+      .mem_ctx = ralloc_context(NULL),
+      .devinfo = &E.devinfo,
+      .isl_dev = &E.isl_dev,
+      .fd      = E.fd,
+   };
+
+   executor_context_setup(&ec);
+
+   executor_params params = {0};
+
+   {
+      parse_execute_args(&ec, L, &params);
+
+      /* Without a shader there is nothing to expand, assemble or run. */
+      if (!params.original_src)
+         failf("execute() requires a 'src' parameter");
+
+      const char *src = executor_apply_macros(&ec, params.original_src);
+
+      FILE *f = fmemopen((void *)src, strlen(src), "r");
+      if (!f)
+         failf("fmemopen");
+      brw_assemble_result asm = brw_assemble(ec.mem_ctx, ec.devinfo, f, "", 0);
+      fclose(f);
+
+      /* Always show the expanded source when the assembler rejects it. */
+      if (INTEL_DEBUG(DEBUG_CS) || !asm.bin) {
+         printf("=== Processed assembly source ===\n"
+                "%s"
+                "=================================\n\n", src);
+      }
+
+      if (!asm.bin)
+         failf("assembler failure");
+
+      params.kernel_bin = asm.bin;
+      params.kernel_size = asm.bin_size;
+   }
+
+   genX_call(emit_execute, &ec, &params);
+
+   if (INTEL_DEBUG(DEBUG_BATCH)) {
+      struct intel_batch_decode_ctx decoder;
+      enum intel_batch_decode_flags flags = INTEL_BATCH_DECODE_DEFAULT_FLAGS;
+      if (INTEL_DEBUG(DEBUG_COLOR))
+         flags |= INTEL_BATCH_DECODE_IN_COLOR;
+
+      intel_batch_decode_ctx_init_brw(&decoder, &E.isa, &E.devinfo, stdout,
+                                      flags, NULL, decode_get_bo, decode_get_state_size, &ec);
+
+      assert(ec.bo.batch.cursor > ec.bo.batch.map);
+      const int batch_offset = ec.batch_start - ec.bo.batch.addr;
+      const int batch_size = (ec.bo.batch.cursor - ec.bo.batch.map) - batch_offset;
+      assert(batch_offset < batch_size);
+
+      intel_print_batch(&decoder, ec.bo.batch.map, batch_size, ec.batch_start, false);
+
+      intel_batch_decode_ctx_finish(&decoder);
+   }
+
+   executor_context_dispatch(&ec);
+
+   {
+      /* TODO: Use userdata to return a wrapped C array instead of building
+       * values.  Could make integration with array operations better.
+       */
+      const uint32_t *data = ec.bo.data.map;
+      const int n = ec.bo.data.size / 4;
+      lua_createtable(L, n, 0);
+
+      /* Hand back the whole buffer: shaders are free to write anywhere in
+       * it, not just to the first few dwords.
+       */
+      for (int i = 0; i < n; i++) {
+         lua_pushinteger(L, data[i]);
+         lua_seti(L, -2, i);
+      }
+   }
+
+   executor_context_teardown(&ec);
+   ralloc_free(ec.mem_ctx);
+
+   return 1;
+}
+
+/* Lua binding for dump(ARRAY, COUNT).
+ *
+ * Prints elements 0 to COUNT-1 of ARRAY as 32-bit hexadecimal values, eight
+ * per row, each row prefixed with the byte offset of its first element.
+ * Elements that are missing or not convertible to an integer print as 0.
+ *
+ * Bad arguments raise a regular Lua error rather than tripping an assert, so
+ * a mistake in the script is reported the same way in every build type.
+ */
+static int
+l_dump(lua_State *L)
+{
+   /* TODO: Use a table to add options for the dump, e.g.
+    * starting offset, format, etc.
+    */
+
+   luaL_checktype(L, 1, LUA_TTABLE);
+   const lua_Integer len_ = luaL_checkinteger(L, 2);
+   luaL_argcheck(L, len_ >= 0 && len_ <= INT_MAX, 2, "count out of range");
+   const int len = (int)len_;
+
+   int i;
+   for (i = 0; i < len; i++) {
+      if (i%8 == 0) printf("[0x%08x]", i * 4);
+      lua_rawgeti(L, 1, i);
+      lua_Integer val = lua_tointeger(L, -1);
+      printf(" 0x%08x", (uint32_t)val);
+      lua_pop(L, 1);
+      if (i%8 == 7) printf("\n");
+   }
+
+   /* Finish a partially filled last row. */
+   if (i%8 != 0) printf("\n");
+   return 0;
+}
+
+/* Lua binding for check_ver(V, ...): returns normally when the Gfx version of
+ * the device is one of the arguments, otherwise terminates through failf().
+ */
+static int
+l_check_ver(lua_State *L)
+{
+   const int nargs = lua_gettop(L);
+   bool supported = false;
+
+   for (int arg = 1; arg <= nargs && !supported; arg++) {
+      const lua_Integer wanted = luaL_checknumber(L, arg);
+      supported = (E.devinfo.ver == wanted);
+   }
+
+   if (!supported) {
+      failf("script doesn't support version=%d verx10=%d\n",
+            E.devinfo.ver, E.devinfo.verx10);
+   }
+   return 0;
+}
+
+/* Lua binding for check_verx10(V, ...): same as check_ver() but compares
+ * against the verx10 of the device (e.g. 125).
+ */
+static int
+l_check_verx10(lua_State *L)
+{
+   const int nargs = lua_gettop(L);
+   bool supported = false;
+
+   for (int arg = 1; arg <= nargs && !supported; arg++) {
+      const lua_Integer wanted = luaL_checknumber(L, arg);
+      supported = (E.devinfo.verx10 == wanted);
+   }
+
+   if (!supported) {
+      failf("script doesn't support version=%d verx10=%d\n",
+            E.devinfo.ver, E.devinfo.verx10);
+   }
+   return 0;
+}
+
+/* TODO: Review numeric limits in the code, especially around Lua integer
+ * conversion.
+ */
+
+/* Entry point: executor FILENAME.
+ *
+ * Prints the help when called without arguments or with a help flag.
+ * Otherwise opens the GPU, initializes the global device state, creates a Lua
+ * state with the standard libraries plus the executor globals (ver, verx10,
+ * execute, dump, check_ver, check_verx10) and runs the script.  Problems are
+ * reported through failf(), which exits with status 1.
+ */
+int
+main(int argc, char *argv[])
+{
+   if (argc < 2 ||
+       !strcmp(argv[1], "--help") ||
+       !strcmp(argv[1], "-help") ||
+       !strcmp(argv[1], "-h") ||
+       !strcmp(argv[1], "help")) {
+      print_help();
+      return 0;
+   }
+
+   if (argc > 2) {
+      /* TODO: Expose extra arguments to the script as a variable. */
+      failf("invalid extra arguments\nusage: executor FILENAME");
+      return 1;
+   }
+
+   process_intel_debug_variable();
+
+   E.fd = get_drm_device(&E.devinfo);
+
+   /* Without a device E.devinfo was never filled in, so nothing below can
+    * work.
+    */
+   if (E.fd < 0)
+      failf("no supported Intel GPU found");
+
+   isl_device_init(&E.isl_dev, &E.devinfo);
+   brw_init_isa_info(&E.isa, &E.devinfo);
+   assert(E.devinfo.kmd_type == INTEL_KMD_TYPE_I915 ||
+          E.devinfo.kmd_type == INTEL_KMD_TYPE_XE);
+
+   lua_State *L = luaL_newstate();
+   if (!L)
+      failf("failed to create Lua state");
+
+   /* TODO: Could be nice to export some kind of builder interface,
+    * maybe even let the script construct a shader at the BRW IR
+    * level and let the later passes kick in.
+    */
+
+   luaL_openlibs(L);
+
+   lua_pushinteger(L, E.devinfo.ver);
+   lua_setglobal(L, "ver");
+
+   lua_pushinteger(L, E.devinfo.verx10);
+   lua_setglobal(L, "verx10");
+
+   lua_pushcfunction(L, l_execute);
+   lua_setglobal(L, "execute");
+
+   lua_pushcfunction(L, l_dump);
+   lua_setglobal(L, "dump");
+
+   lua_pushcfunction(L, l_check_ver);
+   lua_setglobal(L, "check_ver");
+
+   lua_pushcfunction(L, l_check_verx10);
+   lua_setglobal(L, "check_verx10");
+
+   const char *filename = argv[1];
+   int err = luaL_loadfile(L, filename);
+   if (err)
+      failf("failed to load script: %s", lua_tostring(L, -1));
+
+   err = lua_pcall(L, 0, 0, 0);
+   if (err)
+      failf("failed to run script: %s", lua_tostring(L, -1));
+
+   lua_close(L);
+   close(E.fd);
+
+   return 0;
+}
+
+/* Report a fatal error and terminate.
+ *
+ * Writes "ERROR: ", the printf-style formatted message and a newline to
+ * stderr, then exits the process with status 1.
+ */
+void
+failf(const char *fmt, ...)
+{
+   va_list ap;
+
+   fputs("ERROR: ", stderr);
+
+   va_start(ap, fmt);
+   vfprintf(stderr, fmt, ap);
+   va_end(ap);
+
+   fputc('\n', stderr);
+   exit(1);
+}
--- a/src/intel/executor/meson.build
+++ b/src/intel/executor/meson.build
@ -0,0 +1,58 @@
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+# The tool embeds a Lua interpreter; skip this directory without Lua.
+if not dep_lua.found()
+  subdir_done()
+endif
+
+executor_flags = [
+  no_override_init_args,
+  sse2_args,
+]
+
+executor_includes = [
+  inc_include,
+  inc_src,
+  inc_intel,
+]
+
+# executor_genx.c is compiled once per supported hardware version, each time
+# with a different GFX_VERx10 define, into its own static library.
+executor_hw_libs = []
+foreach v: ['90', '110', '120', '125', '200']
+  executor_hw_libs += static_library(
+    'executor_hw_ver@0@'.format(v),
+    ['executor_genx.c', gen_xml_pack],
+    include_directories: [executor_includes],
+    c_args: [
+      executor_flags,
+      '-DGFX_VERx10=@0@'.format(v),
+    ],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [
+      dep_valgrind,
+      idep_genxml,
+    ],
+  )
+endforeach
+
+# The executable itself, linked against all the per-version libraries above.
+executor = executable(
+  'executor',
+  [
+    'executor_main.c',
+    'executor_macros.c',
+  ],
+  dependencies: [
+    dep_libdrm,
+    dep_lua,
+    dep_valgrind,
+    idep_brw_asm,
+    idep_genxml,
+    idep_intel_decoder_brw,
+    idep_intel_dev,
+    idep_libintel_common,
+  ],
+  include_directories: [executor_includes],
+  link_with: [executor_hw_libs],
+  c_args: [executor_flags],
+  gnu_symbol_visibility: 'hidden',
+  install: true
+)
--- a/src/intel/meson.build
+++ b/src/intel/meson.build
@ -23,6 +23,7 @@ if with_intel_hasvk or with_intel_vk or with_gallium_iris
 endif
 if with_intel_tools
  subdir('tools')
+  subdir('executor')
 endif
 if get_option('vulkan-layers').contains('intel-nullhw')
  subdir('nullhw-layer')