intel/executor: Add a matrix multiplication example
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37805>
This commit is contained in:
Caio Oliveira 2025-10-01 20:31:33 -07:00 committed by Marge Bot
parent 1e0ee84841
commit 74859c19fb
4 changed files with 661 additions and 5 deletions

View file

@ -0,0 +1,337 @@
-- Copyright © 2025 Intel Corporation
-- SPDX-License-Identifier: MIT
-- Text printed for --help / -h: usage synopsis, the expected layout of the
-- input files and the per-platform matrix size limits.
--
-- Floating-point inputs are described as plain decimal values because the
-- script parses every token with tonumber() and converts it to the selected
-- format itself before building the shader.
local HELP_MESSAGE = [[
Matrix Multiplication using DPAS
Usage: executor matmul.lua FORMAT A_FILE B_FILE [C_FILE]
Perform matrix multiplication D = A * B + C using DPAS instruction.
If C is not provided, will be equivalent to all zeros.
FORMAT is one of HF/F, BF/F or UB/UD (A and B type / C and D type).
Input files have values separated by spaces, with the rows
separated by newlines. Values are expected to be valid for
the format, e.g. if a matrix should contain UB (unsigned byte),
values should be between 0-255. Floating-point data is written
as ordinary decimal values (e.g. 1.5) and is converted to the
selected format by this script.
Matrices smaller than the maximum dimensions will be automatically
zero-padded to the required size for DPAS computation.
Maximum dimensions are limited by data format and hardware version.
Gfx20+
- HF/F: A max 8x16, B max 16x16, C/D max 8x16
- BF/F: A max 8x16, B max 16x16, C/D max 8x16
- UB/UD: A max 8x32, B max 32x16, C/D max 8x16
Gfx125
- HF/F: A max 8x16, B max 16x8, C/D max 8x8
- BF/F: A max 8x16, B max 16x8, C/D max 8x8
- UB/UD: A max 8x32, B max 32x8, C/D max 8x8
If `octave` is installed, it will be used to verify the results.
]]
-- TODO: Change this program to load matrix data from memory
-- instead of setting data as immediates in the shader code.
-- This example is built around the DPAS instruction, so stop immediately
-- (exit status 1) when the device description reports no DPAS support.
local dpas_available = devinfo.has_dpas
if not dpas_available then
    print("DPAS not supported on this platform.")
    os.exit(1)
end
-- Whether the DPAS result can be cross-checked against GNU Octave. Stays
-- false unless an `octave` executable is found on PATH.
local verify_results = false
do
    -- io.popen raises on platforms without popen support and returns nil
    -- when the process cannot be started; both cases simply mean "no
    -- verification" rather than a crash.
    -- `command -v` is the POSIX way to look up a command and does not rely
    -- on an external `which` binary being installed.
    local ok, handle = pcall(io.popen, "command -v octave 2>/dev/null")
    if ok and handle then
        local octave_path = handle:read("*a")
        handle:close()
        verify_results = octave_path ~= nil and #octave_path > 0
    end
end
local gen = require("mod/gen")
local matrix = require("mod/matrix")
local fp = require("mod/fp")
-- Command-line handling.
--
-- Positional arguments, in order: FORMAT ("AB/CD", case-insensitive),
-- A_FILE, B_FILE and an optional C_FILE. "--help"/"-h" anywhere prints the
-- help text and exits with status 0. Arguments beyond C_FILE are ignored.
--
-- format_ab is the element type of A and B, format_cd the type of C and D.
local format_ab, format_cd, a_file, b_file, c_file
-- Upper-cased FORMAT exactly as given, kept for the error message.
local format_spec
for i = 1, #arg do
    local argument = arg[i]
    if argument == "--help" or argument == "-h" then
        print(HELP_MESSAGE)
        os.exit(0)
    elseif not format_ab then
        format_spec = argument:upper()
        -- Plain search: a FORMAT without "/" must not crash with an
        -- arithmetic-on-nil error; it is rejected by the validation below.
        local slash = format_spec:find("/", 1, true)
        if slash then
            format_ab = format_spec:sub(1, slash - 1)
            format_cd = format_spec:sub(slash + 1)
        else
            format_ab = format_spec
            format_cd = ""
        end
    elseif not a_file then
        a_file = argument
    elseif not b_file then
        b_file = argument
    elseif not c_file then
        c_file = argument
    end
end

if not format_ab or not format_cd or not a_file or not b_file then
    print("Usage: executor matmul.lua FORMAT A_FILE B_FILE [C_FILE]")
    print("Use --help for more information")
    os.exit(1)
end

-- Only these A/B -> C/D type pairings are accepted.
local format_supported =
    (format_ab == "HF" and format_cd == "F") or
    (format_ab == "BF" and format_cd == "F") or
    (format_ab == "UB" and format_cd == "UD")
if not format_supported then
    print("Error: format must be 'HF/F', 'BF/F', or 'UB/UD', got '" .. format_spec .. "'")
    print("Use --help for more information")
    os.exit(1)
end
--- Load a whitespace-separated matrix from a text file into `m`.
--
-- Each non-blank line of the file is one row; values within a row are
-- separated by whitespace and parsed with tonumber(). Blank lines are
-- skipped. Values are stored through `m:set(row, col, value)` using
-- zero-based indices; cells outside the file's extent are left untouched.
--
-- @param m         object providing `set(self, i, j, value)`
-- @param filename  path of the file to read
-- @param max_rows  maximum number of rows accepted
-- @param max_cols  maximum number of columns accepted
-- @return rows, cols, raw_content: the dimensions found in the file and the
--         unmodified file text
-- Raises an error when the file cannot be opened or read, contains no
-- values, contains a token that is not a number, has rows of differing
-- length, or exceeds the given limits.
local function read_matrix(m, filename, max_rows, max_cols)
    local file = io.open(filename, "r")
    if not file then
        error("Failed to open file: " .. filename)
    end
    -- Read entire file content first; the raw text is also returned to the
    -- caller.
    local raw_content = file:read("*all")
    file:close()
    if not raw_content then
        error("Failed to read file: " .. filename)
    end

    local rows_data = {}
    local cols = nil
    -- Parse from the raw content string
    for line in raw_content:gmatch("[^\r\n]+") do
        local row = {}
        for val in line:gmatch("%S+") do
            local num = tonumber(val)
            if not num then
                -- The offending token is passed as an argument, never
                -- spliced into the format string: a token such as "5%d"
                -- would otherwise be interpreted as a format directive.
                error(string.format(
                    "Error reading matrix from '%s': invalid number in row %d: %s",
                    filename, #rows_data + 1, val))
            end
            row[#row + 1] = num
        end
        if #row > 0 then
            if cols == nil then
                cols = #row
            elseif #row ~= cols then
                error(string.format(
                    "Error reading matrix from %s: inconsistent number of columns (%d) in row %d",
                    filename, #row, #rows_data + 1))
            end
            rows_data[#rows_data + 1] = row
        end
    end

    local rows = #rows_data
    -- A file with no values leaves `cols` unset; report that explicitly
    -- instead of failing on a nil comparison below.
    if rows == 0 then
        error(string.format(
            "Error reading matrix from %s: no values found", filename))
    end
    if rows > max_rows then
        error(string.format(
            "Error reading matrix from %s: too many rows (%d), maximum is %d",
            filename, rows, max_rows))
    end
    if cols > max_cols then
        error(string.format(
            "Error reading matrix from %s: too many columns (%d), maximum is %d",
            filename, cols, max_cols))
    end

    for i = 1, rows do
        for j = 1, cols do
            -- Matrix indices are zero indexed.
            m:set(i - 1, j - 1, rows_data[i][j])
        end
    end
    return rows, cols, raw_content
end
-- Size limits for the three operand matrices and their zero-filled storage.

-- 16 on devinfo.ver >= 20, otherwise 8; this bounds the column count of B
-- and of C/D.
local exec_size
if devinfo.ver >= 20 then
    exec_size = 16
else
    exec_size = 8
end

-- 2 for the 16-bit formats (HF, BF) and 4 for UB; presumably the number of
-- A/B elements that fit in one 32-bit channel -- TODO confirm.
local packing_by_format = { HF = 2, BF = 2, UB = 4 }
local packing_factor = packing_by_format[format_ab]

-- Shared dimension of the product: columns of A and rows of B.
local inner_dim = packing_factor * 8

local max_a = { rows = 8, cols = inner_dim }
local max_b = { rows = inner_dim, cols = exec_size }
local max_c = { rows = 8, cols = exec_size }

-- Matrices are always allocated at maximum size with every cell set to 0, so
-- smaller inputs end up zero-padded.
local a = matrix.new(max_a.rows, max_a.cols, 0)
local b = matrix.new(max_b.rows, max_b.cols, 0)
local c = matrix.new(max_c.rows, max_c.cols, 0)
-- Read A and B, then C when a file was given, and make sure their shapes
-- are compatible with D = A * B + C. The raw file text is kept so that the
-- original inputs can later be handed to Octave.
local actual_a_rows, actual_a_cols, a_raw_content =
    read_matrix(a, a_file, max_a.rows, max_a.cols)
local actual_b_rows, actual_b_cols, b_raw_content =
    read_matrix(b, b_file, max_b.rows, max_b.cols)

local inner_dims_agree = (actual_a_cols == actual_b_rows)
if not inner_dims_agree then
    error(string.format(
        "Matrix dimension mismatch: A is %dx%d, B is %dx%d. A columns (%d) must equal B rows (%d)",
        actual_a_rows, actual_a_cols, actual_b_rows, actual_b_cols, actual_a_cols, actual_b_rows))
end

-- Without a C file, C stays all zeros and takes the shape of the A*B
-- result; there is then no raw content for it.
local actual_c_rows, actual_c_cols, c_raw_content = actual_a_rows, actual_b_cols, nil
if c_file then
    actual_c_rows, actual_c_cols, c_raw_content =
        read_matrix(c, c_file, max_c.rows, max_c.cols)
    local shape_matches =
        actual_a_rows == actual_c_rows and actual_b_cols == actual_c_cols
    if not shape_matches then
        error(string.format(
            "Matrix dimension mismatch: A*B would be %dx%d, but C is %dx%d",
            actual_a_rows, actual_b_cols, actual_c_rows, actual_c_cols))
    end
end
-- NOTE(review): this re-declares `exec_size`, shadowing the earlier local of
-- the same name; from here on the value is taken from C's column count.
local exec_size = c.cols

-- Convert every element of `m` to the bit pattern of format `fmt`.
-- HF, BF and F go through the matching fp encoder; any other format (such as
-- UB or UD) is left as it is.
local encode = function(m, fmt)
    local encoders = {
        HF = fp.encode_f16,
        BF = fp.encode_bf16,
        F = fp.encode_f32,
    }
    local encoder = encoders[fmt]
    if encoder then
        m:apply(encoder)
    end
end

-- A and B share the input format; C uses the accumulator format.
for _, job in ipairs({ { a, format_ab }, { b, format_ab }, { c, format_cd } }) do
    encode(job[1], job[2])
end
-- Build the shader source and run it.
--
-- The three gen.mov_grf calls emit code that places A at r10, B at r20 and C
-- at r30. A and C are passed in row-major order; B is passed in interleaved
-- row-major order using `packing_factor`. The dpas instruction then takes
-- r30 (C), r20 (B) and r10 (A) as sources and writes its result to r40.
-- gen.write_grfs(40, 8) presumably copies 8 registers starting at r40 into
-- the buffer returned by execute -- TODO confirm against mod/gen.
local buf = execute {
src =
[[]]
.. gen.mov_grf(format_ab, 10, a:to_row_major())
.. gen.mov_grf(format_ab, 20, b:to_interleaved_row_major(packing_factor))
.. gen.mov_grf(format_cd, 30, c:to_row_major())
.. string.format([[
dpas.8x8(%d) r40<1>%s r30<1>%s r20<1>%s r10<1>%s {A@1 $1};
@syncnop
]], exec_size, format_cd, format_cd, format_ab, format_ab)
.. gen.write_grfs(40, 8)
.. [[
@eot
]],
}
-- Interpret the returned buffer as an 8 x exec_size matrix in row-major
-- order; this is the (still encoded) result D.
local d = matrix.from_row_major_buffer(8, exec_size, buf)
-- Turn the raw result into printable values and show it.
--
-- For result formats whose name contains "F" every element is run through
-- the matching fp decoder and printed with "%.6f"; otherwise the print
-- format is left nil and print_submatrix is called without one.
local d_print_fmt = nil
if format_cd:find("F") then
    d_print_fmt = "%.6f"
    local decoders = {
        HF = fp.decode_f16,
        BF = fp.decode_bf16,
        F = fp.decode_f32,
    }
    local decoder = decoders[format_cd]
    if decoder == nil then
        error("unsupported float format")
    end
    d:apply(decoder)
end

-- Just consider the actual rows, same as C matrix.
d:print_submatrix(actual_c_rows, actual_c_cols, d_print_fmt)
--
-- VERIFICATION USING OCTAVE.
--
-- When Octave is available, recompute A * B + C there from the original
-- input text and compare against the DPAS result. Exits with status 1 on a
-- mismatch.
if verify_results then
    -- Write `text` to a fresh temporary file and return the file's name.
    -- Raises an error if the file cannot be created.
    local function write_temp_file(text)
        local filename = os.tmpname()
        local file, err = io.open(filename, "w")
        if not file then
            error("Failed to create temporary file: " .. tostring(err))
        end
        file:write(text)
        file:close()
        return filename
    end

    -- Save the top-left rows x cols corner of `m` as space-separated text,
    -- one row per line, and return the temporary file's name.
    local function save_matrix_to_temp(m, rows, cols)
        local lines = {}
        for i = 0, rows - 1 do
            local row = {}
            for j = 0, cols - 1 do
                row[#row + 1] = tostring(m:get(i, j))
            end
            lines[#lines + 1] = table.concat(row, " ") .. "\n"
        end
        return write_temp_file(table.concat(lines))
    end

    -- Save input text exactly as it was read and return the file's name.
    local function save_raw_content_to_temp(raw_content)
        return write_temp_file(raw_content)
    end

    -- Save A and B raw contents to temp files for Octave
    local a_for_octave = save_raw_content_to_temp(a_raw_content)
    local b_for_octave = save_raw_content_to_temp(b_raw_content)
    local d_for_octave = save_matrix_to_temp(d, actual_c_rows, actual_c_cols)

    -- C is either loaded from its raw text or, when no C file was given,
    -- created in Octave as zeros of the result shape.
    local c_load, c_for_octave
    if c_raw_content then
        c_for_octave = save_raw_content_to_temp(c_raw_content)
        c_load = string.format("C = single(dlmread('%s'));", c_for_octave)
    else
        c_load = string.format("C = single(zeros(%d, %d));", actual_c_rows, actual_c_cols)
    end

    local tolerance = (format_cd == "F") and "1e-3" or "1e-6"

    -- TODO: Currently values are rounded to what fits in F (32-bit), but for
    -- better results we should have a way to round them to precision based on
    -- format, and handle HF and BF16. Octave doesn't support those types
    -- natively. See https://github.com/higham/chop for Matlab version of this.
    --
    -- The comparison uses `<=` rather than `<`: the tolerance is relative to
    -- the largest expected value, so an all-zero expected result gives a
    -- tolerance of 0 and a strict comparison would reject an exact match.
    local octave_script = string.format([[
A = single(dlmread('%s'));
B = single(dlmread('%s'));
%s
D = single(dlmread('%s'));
D_expected = A * B + C;
%% Use relative tolerance for better comparison across different magnitudes.
max_val = max(abs(D_expected(:)));
tol = max_val * %s;
if all(all(abs(D_expected - D) <= tol))
exit(0);
else
disp('MISMATCH!');
disp('Octave result:');
disp(D_expected);
disp('DPAS result:');
disp(D);
exit(1);
endif
]], a_for_octave, b_for_octave, c_load, d_for_octave, tolerance)

    -- The first result of os.execute is true when the command exited with
    -- status 0 and nil (fail) otherwise.
    local succeeded = os.execute(
        [[octave --quiet --no-gui --eval "]]
        .. octave_script ..
        [[" 2>&1]])

    -- Clean up temporary files
    os.remove(a_for_octave)
    os.remove(b_for_octave)
    if c_for_octave then
        os.remove(c_for_octave)
    end
    os.remove(d_for_octave)

    if succeeded then
        print("\nMatches Octave.")
    else
        print("\nMISMATCH with Octave!")
        os.exit(1)
    end
else
    print("\nNOTE: Install `octave` to verify the results.")
end

View file

@ -0,0 +1,128 @@
-- Copyright © 2025 Intel Corporation
-- SPDX-License-Identifier: MIT
--
-- Encode and decode floating point types.
--
-- Just enough to get basic usage of the examples. If this gets serious, it
-- might be a good idea to implement it using the existing Mesa routines and
-- expose it as part of the executor API.
local M = {}
--- Decode a sign/exponent/mantissa bit pattern into a printable string.
--
-- The layout is, from the most significant bit down: 1 sign bit,
-- `exponent_bits` exponent bits, `mantissa_bits` mantissa bits.
--
-- @param bits           integer holding the encoded value
-- @param mantissa_bits  width of the mantissa field
-- @param exponent_bits  width of the exponent field
-- @param bias           exponent bias
-- @return a string: "0.0"/"-0.0" for zeros, "inf"/"-inf" for infinities,
--         "nan" for NaNs, otherwise the value formatted with "%.17g"
M.decode_float = function(bits, mantissa_bits, exponent_bits, bias)
    local mantissa_scale = 1 << mantissa_bits
    local exponent_all_ones = (1 << exponent_bits) - 1

    local negative = ((bits >> (exponent_bits + mantissa_bits)) & 1) == 1
    local raw_exponent = (bits >> mantissa_bits) & exponent_all_ones
    local fraction_bits = bits & (mantissa_scale - 1)

    -- An all-ones exponent encodes infinity (empty mantissa) or NaN.
    if raw_exponent == exponent_all_ones then
        if fraction_bits ~= 0 then
            return "nan"
        end
        return negative and "-inf" or "inf"
    end

    -- Signed zero.
    if raw_exponent == 0 and fraction_bits == 0 then
        return negative and "-0.0" or "0.0"
    end

    local magnitude
    if raw_exponent == 0 then
        -- Subnormal: no implicit leading 1, so the value is
        -- "0.mantissa * 2^(1-bias)".
        magnitude = fraction_bits / mantissa_scale
        magnitude = magnitude * (2 ^ (1 - bias))
    else
        -- Normal: implicit leading 1, so the value is
        -- "1.mantissa * 2^(exponent-bias)".
        magnitude = 1.0 + (fraction_bits / mantissa_scale)
        magnitude = magnitude * (2 ^ (raw_exponent - bias))
    end

    if negative then
        magnitude = -magnitude
    end
    return string.format("%.17g", magnitude)
end
-- Fixed-format decoders. Each forwards to M.decode_float with that format's
-- mantissa width, exponent width and exponent bias.

-- 32-bit float: 23 mantissa bits, 8 exponent bits, bias 127.
function M.decode_f32(bits)
    return M.decode_float(bits, 23, 8, 127)
end

-- 16-bit half float: 10 mantissa bits, 5 exponent bits, bias 15.
function M.decode_f16(bits)
    return M.decode_float(bits, 10, 5, 15)
end

-- 16-bit bfloat: 7 mantissa bits, 8 exponent bits, bias 127.
function M.decode_bf16(bits)
    return M.decode_float(bits, 7, 8, 127)
end
--- Encode a number as a sign/exponent/mantissa bit pattern.
--
-- The layout is, from the most significant bit down: 1 sign bit,
-- `exponent_bits` exponent bits, `mantissa_bits` mantissa bits.
--
-- @param value_str      number, or string accepted by tonumber()
-- @param mantissa_bits  width of the mantissa field
-- @param exponent_bits  width of the exponent field
-- @param bias           exponent bias
-- @return the encoded integer, or nil when `value_str` is not a number
--
-- The mantissa is rounded with floor(x + 0.5). Values too large for the
-- format become infinity with the value's sign; values below the smallest
-- normal number are encoded as subnormals.
M.encode_float = function(value_str, mantissa_bits, exponent_bits, bias)
    local value = tonumber(value_str)
    if not value then
        return nil
    end

    local total_bits = 1 + exponent_bits + mantissa_bits
    local max_exponent = (1 << exponent_bits) - 1
    local sign_mask = 1 << (total_bits - 1)

    -- `value < 0` is false for -0.0, so negative zero is recognised through
    -- the sign of its reciprocal (1 / -0.0 is -inf). Without this the sign
    -- of a negative zero would be dropped.
    local negative = value < 0 or (value == 0 and 1 / value < 0)
    local sign_bit = negative and sign_mask or 0
    local signed_inf = sign_bit | (max_exponent << mantissa_bits)

    -- Handle various special cases first: signed zero, NaN, Inf/-Inf.
    if value == 0 then
        return sign_bit
    elseif value ~= value then
        -- NaN: sign bit set, all-ones exponent, non-zero mantissa.
        return sign_mask | (max_exponent << mantissa_bits) | 1
    elseif value == math.huge or value == -math.huge then
        return signed_inf
    end

    -- Do math with the absolute value from now on.
    if negative then value = -value end

    local exponent = math.floor(math.log(value) / math.log(2))
    local mantissa_value = value / (2 ^ exponent) - 1.0
    exponent = exponent + bias
    if exponent <= 0 then
        -- Subnormal: no implicit leading 1, use minimum exponent
        mantissa_value = value / (2 ^ (1 - bias))
        exponent = 0
    elseif exponent >= max_exponent then
        -- Value too large to represent.
        return signed_inf
    end

    local mantissa = math.floor(mantissa_value * (1 << mantissa_bits) + 0.5)
    if mantissa >= (1 << mantissa_bits) then
        -- Rounding caused mantissa to overflow, increment exponent.
        mantissa = 0
        exponent = exponent + 1
        if exponent >= max_exponent then
            -- Value too large to represent.
            return signed_inf
        end
    end
    return sign_bit | (exponent << mantissa_bits) | mantissa
end
-- Fixed-format encoders. Each forwards to M.encode_float with that format's
-- mantissa width, exponent width and exponent bias.

-- 32-bit float: 23 mantissa bits, 8 exponent bits, bias 127.
function M.encode_f32(value_str)
    return M.encode_float(value_str, 23, 8, 127)
end

-- 16-bit half float: 10 mantissa bits, 5 exponent bits, bias 15.
function M.encode_f16(value_str)
    return M.encode_float(value_str, 10, 5, 15)
end

-- 16-bit bfloat: 7 mantissa bits, 8 exponent bits, bias 127.
function M.encode_bf16(value_str)
    return M.encode_float(value_str, 7, 8, 127)
end
return M

View file

@ -22,20 +22,24 @@ Matrix.set = function(self, i, j, value)
self.data[i][j] = value
end
--- Print the top-left `rows` x `cols` corner of the matrix.
--
-- Output is a "# RxC matrix" header followed by the rows between "[" and
-- "]" lines, with elements separated by a single space.
--
-- @param rows  number of rows to print, starting at row 0
-- @param cols  number of columns to print, starting at column 0
-- @param fmt   string.format pattern for one element; defaults to "%4u"
Matrix.print_submatrix = function(self, rows, cols, fmt)
    fmt = fmt or "%4u"
    print(string.format("# %dx%d matrix", rows, cols))
    io.write("[\n")
    for i = 0, rows - 1 do
        for j = 0, cols - 1 do
            io.write(string.format(fmt, self.data[i][j]))
            -- No separator after the last printed column.
            if j < cols - 1 then io.write(" ") end
        end
        io.write("\n")
    end
    io.write("]\n")
end
-- Print the whole matrix: print_submatrix over all self.rows x self.cols
-- elements, passing `fmt` through unchanged.
function Matrix:print(fmt)
    self:print_submatrix(self.rows, self.cols, fmt)
end
-- "Interleaved" row major is like row major except that
-- elements from `packing_factor` rows are packed together.
--
@ -136,4 +140,12 @@ M.from_row_major_buffer = function(rows, cols, data)
return self
end
-- Replace every element, in place, with the result of calling `func` on it.
-- Elements are visited row by row using zero-based indices.
function Matrix:apply(func)
    local cells = self.data
    for row = 0, self.rows - 1 do
        for col = 0, self.cols - 1 do
            cells[row][col] = func(cells[row][col])
        end
    end
end
return M

View file

@ -0,0 +1,179 @@
#!/bin/bash
# Runs the matmul.lua example over a set of fixed input matrices.
# Requires the `executor` command to be on PATH.

# matmul.lua is referenced by relative path, so run from this script's own
# directory. Stop if that directory cannot be entered instead of running the
# tests from the wrong place.
cd "$(dirname "${BASH_SOURCE[0]}")" || {
    echo "ERROR: cannot change to the script directory." >&2
    exit 1
}
if ! command -v executor &> /dev/null; then
    echo "ERROR: executor command not found." >&2
    exit 1
fi
# From here on, abort at the first command that fails.
set -e
executor matmul.lua UB/UD \
<(cat <<EOF
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
EOF
) \
<(cat <<EOF
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
EOF
)
executor matmul.lua HF/F \
<(cat <<EOF
0.25 1.75 2.00 3.50 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00 13.00 14.00 15.00
16.00 17.00 18.00 19.00 20.00 21.00 22.75 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.25 46.00 47.00
48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00
80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
EOF
) \
<(cat <<EOF
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.25 10.00 11.00 12.00 13.00 14.00 15.00
16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.00 46.00 47.00
48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.75 75.00 76.00 77.00 78.00 79.00
80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
128.00 129.0 130.0 131.0 132.0 133.0 134.0 135.0 136.0 137.0 138.0 139.0 140.0 141.0 142.0 143.0
144.00 145.0 146.0 147.0 148.0 149.0 150.0 151.0 152.0 153.0 154.0 155.0 156.0 157.0 158.0 159.0
160.00 161.0 162.0 163.0 164.0 165.0 166.0 167.0 168.0 169.0 170.0 171.0 172.0 173.0 174.0 175.0
176.00 177.0 178.0 179.0 180.0 181.0 182.0 183.0 184.0 185.0 186.0 187.0 188.0 189.0 190.0 191.0
192.00 193.0 194.0 195.0 196.0 197.0 198.0 199.0 200.0 201.0 202.0 203.0 204.0 205.0 206.0 207.0
208.00 209.0 210.0 211.0 212.0 213.0 214.0 215.0 216.0 217.0 218.0 219.0 220.0 221.0 222.0 223.0
224.00 225.0 226.0 227.0 228.0 229.0 230.0 231.0 232.0 233.0 234.0 235.0 236.0 237.0 238.0 239.0
240.00 241.0 242.0 243.0 244.0 245.0 246.0 247.0 248.0 249.0 250.0 251.0 252.0 253.0 254.0 255.0
EOF
)
# Partial-size UB/UD case: a 4x8 A times an 8x12 all-ones B, with no C file.
# NOTE(review): B has 12 columns, which is more than the 8 the help text
# lists for Gfx125 -- confirm this test is only meant for Gfx20+.
executor matmul.lua UB/UD \
<(cat <<EOF
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
EOF
) \
<(cat <<EOF
1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1
EOF
)
# Partial-size HF/F case with fractional values: a 3x5 A times a 5x6 B,
# with no C file.
executor matmul.lua HF/F \
<(cat <<EOF
0.25 1.00 2.75 3.00 4.25
5.00 6.75 7.00 8.50 9.00
10.25 11.00 12.75 13.00 14.25
EOF
) \
<(cat <<EOF
0.0 1.25 2.0 3.75 4.0 5.75
6.0 7.25 8.0 9.50 10.0 11.25
12.0 13.75 14.0 15.75 16.0 17.25
18.0 19.50 20.0 21.25 22.0 23.75
24.0 25.75 26.0 27.25 28.0 29.50
EOF
)
# Smallest possible input: 1x1 matrices (22 * 1), with no C file.
executor matmul.lua UB/UD <(echo "22") <(echo "1")
executor matmul.lua BF/F \
<(cat <<EOF
0.25 1.75 2.00 3.50 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00 13.00 14.00 15.00
16.00 17.00 18.00 19.00 20.00 21.00 22.75 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.25 46.00 47.00
48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00
80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
EOF
) \
<(cat <<EOF
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.25 10.00 11.00 12.00 13.00 14.00 15.00
16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00
32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.00 46.00 47.00
48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00
64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.75 75.00 76.00 77.00 78.00 79.00
80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00
96.00 97.00 98.00 99.00 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0
112.00 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0
128.00 129.0 130.0 131.0 132.0 133.0 134.0 135.0 136.0 137.0 138.0 139.0 140.0 141.0 142.0 143.0
144.00 145.0 146.0 147.0 148.0 149.0 150.0 151.0 152.0 153.0 154.0 155.0 156.0 157.0 158.0 159.0
160.00 161.0 162.0 163.0 164.0 165.0 166.0 167.0 168.0 169.0 170.0 171.0 172.0 173.0 174.0 175.0
176.00 177.0 178.0 179.0 180.0 181.0 182.0 183.0 184.0 185.0 186.0 187.0 188.0 189.0 190.0 191.0
192.00 193.0 194.0 195.0 196.0 197.0 198.0 199.0 200.0 201.0 202.0 203.0 204.0 205.0 206.0 207.0
208.00 209.0 210.0 211.0 212.0 213.0 214.0 215.0 216.0 217.0 218.0 219.0 220.0 221.0 222.0 223.0
224.00 225.0 226.0 227.0 228.0 229.0 230.0 231.0 232.0 233.0 234.0 235.0 236.0 237.0 238.0 239.0
240.00 241.0 242.0 243.0 244.0 245.0 246.0 247.0 248.0 249.0 250.0 251.0 252.0 253.0 254.0 255.0
EOF
)
# Partial-size BF/F case with fractional values: a 3x4 A times a 4x5 B,
# with no C file.
executor matmul.lua BF/F \
<(cat <<EOF
1.25 2.75 3.00 4.25
5.00 6.75 7.50 8.00
9.25 10.00 11.75 12.25
EOF
) \
<(cat <<EOF
2.0 1.25 3.75 2.0 1.0
1.0 3.00 2.00 1.50 3.0
3.75 2.00 1.00 3.00 2.0
2.0 1.25 3.00 2.00 1.75
EOF
)
# Only reached when every invocation above exited successfully (set -e).
echo ""
echo "All tests passed!"