intel/executor: Add a matrix multiplication example

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37805>
2026-02-15 06:40:27 +01:00 · 2025-10-01 20:31:33 -07:00 · 2025-10-01 20:31:33 -07:00 · 74859c19fb
commit 74859c19fb
parent 1e0ee84841
4 changed files with 661 additions and 5 deletions
--- a/src/intel/executor/examples/matmul.lua
+++ b/src/intel/executor/examples/matmul.lua
@ -0,0 +1,337 @@
+-- Copyright © 2025 Intel Corporation
+-- SPDX-License-Identifier: MIT
+
+-- Text printed for --help / -h.
+local HELP_MESSAGE = [[
+Matrix Multiplication using DPAS
+
+Usage: executor matmul.lua FORMAT A_FILE B_FILE [C_FILE]
+
+Perform matrix multiplication D = A * B + C using DPAS instruction.
+If C is not provided, it is treated as all zeros.
+
+FORMAT is HF/F, BF/F or UB/UD (case-insensitive): the element type
+of A and B, a slash, then the element type of C and D.
+
+Input files have values separated by spaces, with the rows
+separated by newlines.  Values are expected to be valid for
+the format, e.g. if a matrix should contain UB (unsigned byte),
+values should be between 0-255.  Floating-point data is given
+as plain numeric values (e.g. 0.25 or 1.5e2) and is converted
+to the selected format by this script.
+
+Matrices smaller than the maximum dimensions will be automatically
+zero-padded to the required size for DPAS computation.
+
+Maximum dimensions are limited by data format and hardware version.
+
+   Gfx20+
+   - HF/F:  A max 8x16, B max 16x16, C/D max 8x16
+   - BF/F:  A max 8x16, B max 16x16, C/D max 8x16
+   - UB/UD: A max 8x32, B max 32x16, C/D max 8x16
+
+   Gfx125
+   - HF/F:  A max 8x16, B max 16x8, C/D max 8x8
+   - BF/F:  A max 8x16, B max 16x8, C/D max 8x8
+   - UB/UD: A max 8x32, B max 32x8, C/D max 8x8
+
+If `octave` is installed, it will be used to verify the results.
+]]
+
+-- TODO: Change this program to load matrix data from memory
+-- instead of setting data as immediates in the shader code.
+
+-- The whole example is built around the DPAS instruction, so stop early on
+-- devices that do not report support for it.
+if not devinfo.has_dpas then
+  print("DPAS not supported on this platform.")
+  os.exit(1)
+end
+
+-- Verification is optional and enabled only when `which octave` prints
+-- something.  io.popen() yields nil when the command could not be started;
+-- treat that like "octave not installed" instead of indexing a nil handle.
+local verify_results = false
+do
+  local handle = io.popen("which octave 2>/dev/null")
+  if handle then
+    local octave_path = handle:read("*a")
+    handle:close()
+    verify_results = octave_path ~= nil and #octave_path > 0
+  end
+end
+
+local gen = require("mod/gen")
+local matrix = require("mod/matrix")
+local fp = require("mod/fp")
+
+-- Positional arguments, filled in the order FORMAT, A_FILE, B_FILE, C_FILE.
+-- FORMAT is upper-cased and split at its first "/" into the A/B element
+-- type (format_ab) and the C/D element type (format_cd).
+local format_ab, format_cd, a_file, b_file, c_file
+
+for i = 1, #arg do
+  if arg[i] == "--help" or arg[i] == "-h" then
+    print(HELP_MESSAGE)
+    os.exit(0)
+  elseif not format_ab then
+    local format = arg[i]:upper()
+    -- Plain (non-pattern) search for the separator.  Without this check a
+    -- FORMAT lacking a slash fails with an arithmetic-on-nil error rather
+    -- than a message the user can act on.
+    local slash = format:find("/", 1, true)
+    if not slash then
+      print("Error: format must be 'HF/F', 'BF/F', or 'UB/UD', got '" .. format .. "'")
+      print("Use --help for more information")
+      os.exit(1)
+    end
+    format_ab = format:sub(1, slash - 1)
+    format_cd = format:sub(slash + 1)
+  elseif not a_file then a_file = arg[i]
+  elseif not b_file then b_file = arg[i]
+  elseif not c_file then c_file = arg[i]
+  end
+end
+
+-- FORMAT, A_FILE and B_FILE are mandatory; C_FILE is optional.
+if not format_ab or not format_cd or not a_file or not b_file then
+  print("Usage: executor matmul.lua FORMAT A_FILE B_FILE [C_FILE]")
+  print("Use --help for more information")
+  os.exit(1)
+end
+
+-- Only these three input/output type pairings are accepted.
+if not ((format_ab == "HF" and format_cd == "F") or
+        (format_ab == "BF" and format_cd == "F") or
+        (format_ab == "UB" and format_cd == "UD")) then
+  print("Error: format must be 'HF/F', 'BF/F', or 'UB/UD', got '" .. format_ab .. "/" .. format_cd .. "'")
+  print("Use --help for more information")
+  os.exit(1)
+end
+
+-- Load a whitespace-separated matrix from `filename` into `m`.
+--
+-- Every line that contains at least one value is a row, and all rows must
+-- have the same number of values.  Values are parsed with tonumber() and
+-- stored through m:set(row, col, value) using zero-based indices; cells of
+-- `m` beyond the dimensions found in the file are not written.
+--
+-- Parameters:
+--   m         object providing a set(i, j, value) method
+--   filename  path of the file to read
+--   max_rows  largest accepted number of rows
+--   max_cols  largest accepted number of columns
+--
+-- Returns the number of rows, the number of columns and the unparsed file
+-- contents.  Raises an error when the file cannot be opened, holds no
+-- values, contains a token that is not a number, has rows of differing
+-- length, or exceeds the given maximums.
+local function read_matrix(m, filename, max_rows, max_cols)
+  local file = io.open(filename, "r")
+  if not file then
+    error("Failed to open file: " .. filename)
+  end
+
+  -- Read entire file content first; it is also returned verbatim.
+  local raw_content = file:read("*all")
+  file:close()
+
+  local rows_data = {}
+  local cols = nil
+
+  -- Parse from the raw content string
+  for line in raw_content:gmatch("[^\r\n]+") do
+    local row = {}
+    for val in line:gmatch("%S+") do
+      local num = tonumber(val)
+      if not num then
+        -- The offending token is passed as an argument, never spliced into
+        -- the format string, so a "%" inside it cannot break string.format.
+        error(string.format(
+          "Error reading matrix from '%s': invalid number in row %d: %s",
+          filename, #rows_data + 1, val))
+      end
+
+      table.insert(row, num)
+    end
+
+    -- Lines holding only whitespace are skipped.
+    if #row > 0 then
+      if cols == nil then
+        cols = #row
+      elseif #row ~= cols then
+        error(string.format(
+          "Error reading matrix from %s: inconsistent number of columns (%d) in row %d",
+          filename, #row, #rows_data + 1))
+      end
+      table.insert(rows_data, row)
+    end
+  end
+
+  local rows = #rows_data
+
+  -- With no rows `cols` is still nil; report that explicitly instead of
+  -- failing on the nil comparison further down.
+  if rows == 0 then
+    error(string.format(
+      "Error reading matrix from %s: file contains no values", filename))
+  end
+
+  if rows > max_rows then
+    error(string.format(
+      "Error reading matrix from %s: too many rows (%d), maximum is %d",
+      filename, rows, max_rows))
+  end
+
+  if cols > max_cols then
+    error(string.format(
+      "Error reading matrix from %s: too many columns (%d), maximum is %d",
+      filename, cols, max_cols))
+  end
+
+  for i = 1, rows do
+    for j = 1, cols do
+      -- Matrix indices are zero indexed.
+      m:set(i-1, j-1, rows_data[i][j])
+    end
+  end
+
+  return rows, cols, raw_content
+end
+
+-- Number of columns of B, C and D: 16 from ver 20 onwards, 8 before that.
+local exec_size
+if devinfo.ver >= 20 then
+  exec_size = 16
+else
+  exec_size = 8
+end
+
+-- Per-format multiplier for the shared A-columns / B-rows dimension; also
+-- handed to to_interleaved_row_major() when B is laid out.
+-- NOTE(review): presumably the number of A/B elements per 32-bit slot
+-- (2 x 16-bit, 4 x 8-bit) -- confirm.
+local packing_factor = ({ HF = 2, BF = 2, UB = 4 })[format_ab]
+
+-- Largest operand shapes accepted for the selected format and device.
+local max_a = { rows = 8,                  cols = packing_factor * 8 }
+local max_b = { rows = packing_factor * 8, cols = exec_size }
+local max_c = { rows = 8,                  cols = exec_size }
+
+-- Operands are allocated at their maximum size; the input files only
+-- overwrite the cells they provide.
+-- NOTE(review): the third argument to matrix.new is presumably the initial
+-- fill value -- confirm in mod/matrix.
+local a = matrix.new(max_a.rows, max_a.cols, 0)
+local b = matrix.new(max_b.rows, max_b.cols, 0)
+local c = matrix.new(max_c.rows, max_c.cols, 0)
+
+local actual_a_rows, actual_a_cols, a_raw_content =
+  read_matrix(a, a_file, max_a.rows, max_a.cols)
+local actual_b_rows, actual_b_cols, b_raw_content =
+  read_matrix(b, b_file, max_b.rows, max_b.cols)
+
+-- A*B is only defined when the inner dimensions agree.
+if actual_a_cols ~= actual_b_rows then
+  local message = string.format(
+    "Matrix dimension mismatch: A is %dx%d, B is %dx%d. A columns (%d) must equal B rows (%d)",
+    actual_a_rows, actual_a_cols, actual_b_rows, actual_b_cols, actual_a_cols, actual_b_rows)
+  error(message)
+end
+
+-- Without a C file, C keeps its initial contents and takes the shape of
+-- the A*B result; c_raw_content stays nil in that case.
+local actual_c_rows, actual_c_cols = actual_a_rows, actual_b_cols
+local c_raw_content = nil
+if c_file then
+  actual_c_rows, actual_c_cols, c_raw_content =
+    read_matrix(c, c_file, max_c.rows, max_c.cols)
+  if not (actual_a_rows == actual_c_rows and actual_b_cols == actual_c_cols) then
+    local message = string.format(
+      "Matrix dimension mismatch: A*B would be %dx%d, but C is %dx%d",
+      actual_a_rows, actual_b_cols, actual_c_rows, actual_c_cols)
+    error(message)
+  end
+end
+
+-- Re-declared from the allocated C matrix; shadows the earlier local of
+-- the same name.
+local exec_size = c.cols
+
+-- Convert every element of `m` in place to the bit pattern of format `fmt`.
+-- Formats without an entry below (the integer ones) are left untouched.
+local encode = function(m, fmt)
+  local encoders = {
+    HF = fp.encode_f16,
+    BF = fp.encode_bf16,
+    F  = fp.encode_f32,
+  }
+
+  local encoder = encoders[fmt]
+  if encoder then
+    m:apply(encoder)
+  end
+end
+
+-- A and B share the input format, C uses the output format.
+encode(a, format_ab)
+encode(b, format_ab)
+encode(c, format_cd)
+
+-- Assemble the shader text piece by piece and run it.  Register use, as
+-- visible from the arguments: A goes to r10, B to r20, C to r30, the DPAS
+-- destination is r40 and 8 registers starting at r40 are written out.
+-- NOTE(review): the exact instructions emitted by gen.mov_grf and
+-- gen.write_grfs live in mod/gen -- confirm there.
+local buf
+do
+  local load_a = gen.mov_grf(format_ab, 10, a:to_row_major())
+  local load_b = gen.mov_grf(format_ab, 20, b:to_interleaved_row_major(packing_factor))
+  local load_c = gen.mov_grf(format_cd, 30, c:to_row_major())
+
+  -- Operand order in the template is destination, C, B, A.
+  local dpas = string.format([[
+
+    dpas.8x8(%d)  r40<1>%s  r30<1>%s  r20<1>%s  r10<1>%s  {A@1 $1};
+    @syncnop
+
+    ]], exec_size, format_cd, format_cd, format_ab, format_ab)
+
+  local store_d = gen.write_grfs(40, 8)
+
+  local finish = [[
+    @eot
+    ]]
+
+  buf = execute {
+    src = load_a .. load_b .. load_c .. dpas .. store_d .. finish,
+  }
+end
+
+-- Interpret the buffer returned by the shader as an 8 x exec_size matrix.
+local d = matrix.from_row_major_buffer(8, exec_size, buf)
+
+-- Float results come back as bit patterns: decode them and print with six
+-- decimals.  Integer results keep print_submatrix's default format.
+-- NOTE(review): the fp decoders return strings; a decoded "inf" or "nan"
+-- will probably be rejected by the "%.6f" conversion -- confirm.
+local d_print_fmt
+if format_cd:find("F") then
+  local decoders = {
+    HF = fp.decode_f16,
+    BF = fp.decode_bf16,
+    F  = fp.decode_f32,
+  }
+
+  local decoder = decoders[format_cd]
+  if not decoder then
+    error("unsupported float format")
+  end
+
+  d_print_fmt = "%.6f"
+  d:apply(decoder)
+end
+
+-- Only the part of D matching the actual (unpadded) C dimensions is shown.
+d:print_submatrix(actual_c_rows, actual_c_cols, d_print_fmt)
+
+--
+-- VERIFICATION USING OCTAVE.
+--
+
+-- Cross-check the printed result against Octave when it is available:
+-- A, B and (optionally) C are passed on as the original file text, D as
+-- the decoded values, and Octave recomputes A * B + C in single precision.
+if verify_results then
+  -- Write `raw_content` to a fresh temporary file and return its name.
+  -- Fails with the io.open message if the file cannot be created.
+  local function save_raw_content_to_temp(raw_content)
+    local filename = os.tmpname()
+    local file = assert(io.open(filename, "w"))
+    file:write(raw_content)
+    file:close()
+    return filename
+  end
+
+  -- Write the top-left rows x cols part of `m`, one row per line with the
+  -- values separated by spaces, to a temporary file and return its name.
+  local function save_matrix_to_temp(m, rows, cols)
+    local lines = {}
+    for i = 0, rows - 1 do
+      local row = {}
+      for j = 0, cols - 1 do
+        row[#row + 1] = tostring(m:get(i, j))
+      end
+      lines[#lines + 1] = table.concat(row, " ") .. "\n"
+    end
+    return save_raw_content_to_temp(table.concat(lines))
+  end
+
+  -- Save A and B raw contents to temp files for Octave
+  local a_for_octave = save_raw_content_to_temp(a_raw_content)
+  local b_for_octave = save_raw_content_to_temp(b_raw_content)
+  local d_for_octave = save_matrix_to_temp(d, actual_c_rows, actual_c_cols)
+
+  -- C is either loaded from its own temp file or synthesized as zeros.
+  local c_load, c_for_octave
+  if c_raw_content then
+    c_for_octave = save_raw_content_to_temp(c_raw_content)
+    c_load = string.format("C = single(dlmread('%s'));", c_for_octave)
+  else
+    c_load = string.format("C = single(zeros(%d, %d));", actual_c_rows, actual_c_cols)
+  end
+
+  local tolerance = (format_cd == "F") and "1e-3" or "1e-6"
+
+  -- TODO: Currently values are rounded to what fits in F (32-bit), but for
+  -- better results we should have a way to round them to precision based on
+  -- format, and handle HF and BF16. Octave doesn't support those types
+  -- natively.  See https://github.com/higham/chop for Matlab version of this.
+
+  -- The script is passed to the shell inside double quotes, so it must not
+  -- contain double quotes itself.  The comparison uses <= so that an
+  -- all-zero expected result (which makes tol zero) still counts as a match.
+  local octave_script = string.format([[
+A = single(dlmread('%s'));
+B = single(dlmread('%s'));
+%s
+D = single(dlmread('%s'));
+
+D_expected = A * B + C;
+
+%% Use relative tolerance for better comparison across different magnitudes.
+max_val = max(abs(D_expected(:)));
+tol = max_val * %s;
+
+if all(all(abs(D_expected - D) <= tol))
+  exit(0);
+else
+  disp('MISMATCH!');
+  disp('Octave result:');
+  disp(D_expected);
+  disp('DPAS result:');
+  disp(D);
+  exit(1);
+endif
+]], a_for_octave, b_for_octave, c_load, d_for_octave, tolerance)
+
+  local exit_code = os.execute(
+    [[octave --quiet --no-gui --eval "]]
+    .. octave_script ..
+    [[" 2>&1]])
+
+  -- Clean up temporary files
+  os.remove(a_for_octave)
+  os.remove(b_for_octave)
+  if c_for_octave then
+    os.remove(c_for_octave)
+  end
+  os.remove(d_for_octave)
+
+  -- os.execute's first result is truthy only when the command succeeded.
+  if exit_code then
+    print("\nMatches Octave.")
+  else
+    print("\nMISMATCH with Octave!")
+    os.exit(1)
+  end
+else
+  print("\nNOTE: Install `octave` to verify the results.")
+end
--- a/src/intel/executor/examples/mod/fp.lua
+++ b/src/intel/executor/examples/mod/fp.lua
@ -0,0 +1,128 @@
+-- Copyright © 2025 Intel Corporation
+-- SPDX-License-Identifier: MIT
+--
+-- Encode and decode floating point types.
+--
+-- Just enough to get basic usage of the examples.  If this gets serious, it
+-- might be a good idea to implement it using the existing Mesa routines and
+-- expose it as part of the executor API.
+
+local M = {}
+
+-- Decode the bit pattern `bits` of a float laid out as
+-- sign | exponent_bits | mantissa_bits with exponent bias `bias`.
+--
+-- Returns a string: "0.0"/"-0.0", "inf"/"-inf", "nan", or the value
+-- formatted with "%.17g".
+function M.decode_float(bits, mantissa_bits, exponent_bits, bias)
+  local mantissa_scale = 1 << mantissa_bits
+  local exponent_ones = (1 << exponent_bits) - 1
+
+  -- Pull the three fields out by shifting, then masking.
+  local negative = ((bits >> (exponent_bits + mantissa_bits)) & 1) == 1
+  local exponent = (bits >> mantissa_bits) & exponent_ones
+  local mantissa = bits & (mantissa_scale - 1)
+
+  local significand, power
+
+  if exponent == 0 then
+    if mantissa == 0 then
+      return negative and "-0.0" or "0.0"
+    end
+    -- Subnormal: no implicit leading 1, i.e. "0.mantissa * 2^(1-bias)".
+    significand = mantissa / mantissa_scale
+    power = 1 - bias
+  elseif exponent == exponent_ones then
+    -- All-ones exponent encodes infinities (zero mantissa) and NaNs.
+    if mantissa ~= 0 then
+      return "nan"
+    end
+    return negative and "-inf" or "inf"
+  else
+    -- Normal: implicit leading 1, i.e. "1.mantissa * 2^(exponent-bias)".
+    significand = 1.0 + (mantissa / mantissa_scale)
+    power = exponent - bias
+  end
+
+  local magnitude = significand * (2 ^ power)
+  return string.format("%.17g", negative and -magnitude or magnitude)
+end
+
+-- Decoders for the concrete formats; each forwards to M.decode_float with
+-- that format's (mantissa bits, exponent bits, bias).
+
+-- 32-bit float: 23 mantissa bits, 8 exponent bits, bias 127.
+function M.decode_f32(bits)
+  return M.decode_float(bits, 23, 8, 127)
+end
+
+-- 16-bit half float: 10 mantissa bits, 5 exponent bits, bias 15.
+function M.decode_f16(bits)
+  return M.decode_float(bits, 10, 5, 15)
+end
+
+-- bfloat16: 7 mantissa bits, 8 exponent bits, bias 127.
+function M.decode_bf16(bits)
+  return M.decode_float(bits, 7, 8, 127)
+end
+
+-- Encode a number as the bit pattern of a float laid out as
+-- sign | exponent_bits | mantissa_bits with exponent bias `bias`.
+--
+-- `value_str` is anything tonumber() accepts (a number or a numeric
+-- string).  Returns the bit pattern as an integer, or nil when the input is
+-- not a number.  Values too large for the format become a signed infinity,
+-- values below the smallest normal number are encoded as subnormals, and
+-- the mantissa is rounded by adding one half and truncating.
+M.encode_float = function(value_str, mantissa_bits, exponent_bits, bias)
+  local value = tonumber(value_str)
+  if not value then
+    return nil
+  end
+
+  local total_bits = 1 + exponent_bits + mantissa_bits
+  local max_exponent = (1 << exponent_bits) - 1
+
+  -- `value < 0` is false for -0.0, so negative zero is recognised through
+  -- the sign of its reciprocal (1 / -0.0 is -inf).  Integer zero has no
+  -- sign and yields +inf here.
+  local negative = value < 0 or (value == 0 and 1 / value < 0)
+  local sign_bit = negative and (1 << (total_bits - 1)) or 0
+  local signed_inf = sign_bit | max_exponent << mantissa_bits
+
+  -- Handle various special cases first: signed zero, NaN, Inf/-Inf.
+  if value == 0.0 then
+    return sign_bit
+  elseif value ~= value then
+    return (1 << (total_bits - 1)) | (max_exponent << mantissa_bits) | 1
+  elseif value == math.huge or value == -math.huge then
+    return signed_inf
+  end
+
+  -- Do math with the absolute value from now on.
+  if sign_bit ~= 0 then value = -value end
+
+  -- Unbiased exponent: floor(log2(value)).
+  local exponent = math.floor(math.log(value) / math.log(2))
+
+  -- Fractional part of the significand, assuming an implicit leading 1.
+  local mantissa_value = value / (2 ^ exponent) - 1.0
+
+  exponent = exponent + bias
+
+  if exponent <= 0 then
+    -- Subnormal: no implicit leading 1, use minimum exponent
+    mantissa_value = value / (2 ^ (1 - bias))
+    exponent = 0
+  elseif exponent >= max_exponent then
+    -- Value too large to represent.
+    return signed_inf
+  end
+
+  local mantissa = math.floor(mantissa_value * (1 << mantissa_bits) + 0.5)
+
+  if mantissa >= (1 << mantissa_bits) then
+    -- Rounding caused mantissa to overflow, increment exponent.
+    mantissa = 0
+    exponent = exponent + 1
+    if exponent >= max_exponent then
+      -- Value too large to represent.
+      return signed_inf
+    end
+  end
+
+  return sign_bit | (exponent << mantissa_bits) | mantissa
+end
+
+-- Encoders for the concrete formats; each forwards to M.encode_float with
+-- that format's (mantissa bits, exponent bits, bias).
+
+-- 32-bit float: 23 mantissa bits, 8 exponent bits, bias 127.
+function M.encode_f32(value_str)
+  return M.encode_float(value_str, 23, 8, 127)
+end
+
+-- 16-bit half float: 10 mantissa bits, 5 exponent bits, bias 15.
+function M.encode_f16(value_str)
+  return M.encode_float(value_str, 10, 5, 15)
+end
+
+-- bfloat16: 7 mantissa bits, 8 exponent bits, bias 127.
+function M.encode_bf16(value_str)
+  return M.encode_float(value_str, 7, 8, 127)
+end
+
+return M
--- a/src/intel/executor/examples/mod/matrix.lua
+++ b/src/intel/executor/examples/mod/matrix.lua
@ -22,20 +22,24 @@ Matrix.set = function(self, i, j, value)
  self.data[i][j] = value
 end

-Matrix.print = function(self, fmt)
+Matrix.print_submatrix = function(self, rows, cols, fmt)
  local fmt = fmt or "%4u"
-  print(string.format("# %dx%d matrix", self.rows, self.cols))
+  print(string.format("# %dx%d matrix", rows, cols))
  io.write("[\n")
-  for i = 0, self.rows - 1 do
-    for j = 0, self.cols - 1 do
+  for i = 0, rows - 1 do
+    for j = 0, cols - 1 do
      io.write(string.format(fmt, self.data[i][j]))
-      if j < self.cols - 1 then io.write(" ") end
+      if j < cols - 1 then io.write(" ") end
    end
    io.write("\n")
  end
  io.write("]\n")
 end

+Matrix.print = function(self, fmt)
+  self:print_submatrix(self.rows, self.cols, fmt)
+end
+
 -- "Interleaved" row major is like row major except that
 -- elements from `packing_factor` rows are packed together.
 --
@ -136,4 +140,12 @@ M.from_row_major_buffer = function(rows, cols, data)
  return self
 end

+Matrix.apply = function(self, func)
+  for i = 0, self.rows - 1 do
+    for j = 0, self.cols - 1 do
+      self.data[i][j] = func(self.data[i][j])
+    end
+  end
+end
+
 return M
--- a/src/intel/executor/examples/test_matmul.sh
+++ b/src/intel/executor/examples/test_matmul.sh
@ -0,0 +1,179 @@
+#!/bin/bash
+
+# Run from the directory containing this script, since matmul.lua is
+# invoked by relative name below.  `set -e` is not active yet, so a failed
+# cd has to be handled explicitly.
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+if ! command -v executor &> /dev/null; then
+    echo "ERROR: executor command not found." >&2
+    exit 1
+fi
+
+# From here on, the first failing command aborts the whole run.
+set -e
+
+executor matmul.lua UB/UD \
+  <(cat <<EOF
+  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
+ 32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
+ 64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
+ 96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
+128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
+160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
+192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
+224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
+EOF
+) \
+  <(cat <<EOF
+  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
+ 16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
+ 32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
+ 48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
+ 64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79
+ 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
+ 96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111
+112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
+128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
+144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
+160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
+176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
+192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
+208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
+224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
+240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
+  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
+ 16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
+ 32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
+ 48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
+ 64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79
+ 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
+ 96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111
+112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
+128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
+144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
+160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
+176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
+192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
+208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
+224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
+240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
+EOF
+)
+
+executor matmul.lua HF/F \
+  <(cat <<EOF
+  0.25  1.75  2.00  3.50  4.00  5.00  6.00  7.00  8.00  9.00 10.00 11.00 12.00 13.00 14.00 15.00
+ 16.00 17.00 18.00 19.00 20.00 21.00 22.75 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
+ 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.25 46.00 47.00
+ 48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
+ 64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00
+ 80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
+ 96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
+112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
+EOF
+) \
+  <(cat <<EOF
+  0.00  1.00  2.00  3.00  4.00  5.00  6.00  7.00  8.00  9.25 10.00 11.00 12.00 13.00 14.00 15.00
+ 16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
+ 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.00 46.00 47.00
+ 48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
+ 64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.75 75.00 76.00 77.00 78.00 79.00
+ 80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
+ 96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
+112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
+128.00 129.0 130.0 131.0 132.0 133.0 134.0 135.0 136.0 137.0 138.0 139.0 140.0 141.0 142.0 143.0
+144.00 145.0 146.0 147.0 148.0 149.0 150.0 151.0 152.0 153.0 154.0 155.0 156.0 157.0 158.0 159.0
+160.00 161.0 162.0 163.0 164.0 165.0 166.0 167.0 168.0 169.0 170.0 171.0 172.0 173.0 174.0 175.0
+176.00 177.0 178.0 179.0 180.0 181.0 182.0 183.0 184.0 185.0 186.0 187.0 188.0 189.0 190.0 191.0
+192.00 193.0 194.0 195.0 196.0 197.0 198.0 199.0 200.0 201.0 202.0 203.0 204.0 205.0 206.0 207.0
+208.00 209.0 210.0 211.0 212.0 213.0 214.0 215.0 216.0 217.0 218.0 219.0 220.0 221.0 222.0 223.0
+224.00 225.0 226.0 227.0 228.0 229.0 230.0 231.0 232.0 233.0 234.0 235.0 236.0 237.0 238.0 239.0
+240.00 241.0 242.0 243.0 244.0 245.0 246.0 247.0 248.0 249.0 250.0 251.0 252.0 253.0 254.0 255.0
+EOF
+)
+
+executor matmul.lua UB/UD \
+  <(cat <<EOF
+ 0  1  2  3  4  5  6  7
+ 8  9 10 11 12 13 14 15
+16 17 18 19 20 21 22 23
+24 25 26 27 28 29 30 31
+EOF
+) \
+  <(cat <<EOF
+1 1 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1 1 1 1 1 1 1 1
+1 1 1 1 1 1 1 1 1 1 1 1
+EOF
+)
+
+executor matmul.lua HF/F \
+  <(cat <<EOF
+ 0.25  1.00  2.75  3.00  4.25
+ 5.00  6.75  7.00  8.50  9.00
+10.25 11.00 12.75 13.00 14.25
+EOF
+) \
+  <(cat <<EOF
+ 0.0  1.25  2.0  3.75  4.0  5.75
+ 6.0  7.25  8.0  9.50 10.0 11.25
+12.0 13.75 14.0 15.75 16.0 17.25
+18.0 19.50 20.0 21.25 22.0 23.75
+24.0 25.75 26.0 27.25 28.0 29.50
+EOF
+)
+
+executor matmul.lua UB/UD <(echo "22") <(echo "1")
+
+executor matmul.lua BF/F \
+  <(cat <<EOF
+  0.25  1.75  2.00  3.50  4.00  5.00  6.00  7.00  8.00  9.00 10.00 11.00 12.00 13.00 14.00 15.00
+ 16.00 17.00 18.00 19.00 20.00 21.00 22.75 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
+ 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.25 46.00 47.00
+ 48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
+ 64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00
+ 80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
+ 96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
+112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
+EOF
+) \
+  <(cat <<EOF
+  0.00  1.00  2.00  3.00  4.00  5.00  6.00  7.00  8.00  9.25 10.00 11.00 12.00 13.00 14.00 15.00
+ 16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
+ 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.00 46.00 47.00
+ 48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
+ 64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.75 75.00 76.00 77.00 78.00 79.00
+ 80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
+ 96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
+112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
+128.00 129.0 130.0 131.0 132.0 133.0 134.0 135.0 136.0 137.0 138.0 139.0 140.0 141.0 142.0 143.0
+144.00 145.0 146.0 147.0 148.0 149.0 150.0 151.0 152.0 153.0 154.0 155.0 156.0 157.0 158.0 159.0
+160.00 161.0 162.0 163.0 164.0 165.0 166.0 167.0 168.0 169.0 170.0 171.0 172.0 173.0 174.0 175.0
+176.00 177.0 178.0 179.0 180.0 181.0 182.0 183.0 184.0 185.0 186.0 187.0 188.0 189.0 190.0 191.0
+192.00 193.0 194.0 195.0 196.0 197.0 198.0 199.0 200.0 201.0 202.0 203.0 204.0 205.0 206.0 207.0
+208.00 209.0 210.0 211.0 212.0 213.0 214.0 215.0 216.0 217.0 218.0 219.0 220.0 221.0 222.0 223.0
+224.00 225.0 226.0 227.0 228.0 229.0 230.0 231.0 232.0 233.0 234.0 235.0 236.0 237.0 238.0 239.0
+240.00 241.0 242.0 243.0 244.0 245.0 246.0 247.0 248.0 249.0 250.0 251.0 252.0 253.0 254.0 255.0
+EOF
+)
+
+executor matmul.lua BF/F \
+  <(cat <<EOF
+ 1.25  2.75  3.00  4.25
+ 5.00  6.75  7.50  8.00
+ 9.25 10.00 11.75 12.25
+EOF
+) \
+  <(cat <<EOF
+ 2.0  1.25  3.75  2.0  1.0
+ 1.0  3.00  2.00  1.50 3.0
+ 3.75 2.00  1.00  3.00 2.0
+ 2.0  1.25  3.00  2.00 1.75
+EOF
+)
+
+echo ""
+echo "All tests passed!"
+