From cf8f79a9fcfcca1e4337ed22c02595516024c22c Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Date: Mon, 14 Sep 2020 13:48:28 -0400
Subject: [PATCH] pan/bi: Add packing generator

From the ISA definition, we can generate a function for each instruction
that looks at the bi_instruction in the intermediate representation and
emits a 20- or 23-bit word (for ADD/FMA respectively) containing that
instruction with all of its modifiers.

These will approximate the old packing routines, although the mapping of
bi_instruction to machine instructions will be hardcoded (at least for
now).

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Daniel Stone <daniels@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6749>
---
 src/panfrost/bifrost/gen_pack.py | 584 +++++++++++++++++++++++++++++++
 1 file changed, 584 insertions(+)
 create mode 100644 src/panfrost/bifrost/gen_pack.py

diff --git a/src/panfrost/bifrost/gen_pack.py b/src/panfrost/bifrost/gen_pack.py
new file mode 100644
index 00000000000..9eb191d2145
--- /dev/null
+++ b/src/panfrost/bifrost/gen_pack.py
@@ -0,0 +1,584 @@
+#
+# Copyright (C) 2020 Collabora, Ltd.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+import sys
+from isa_parse import parse_instructions, opname_to_c
+from mako.template import Template
+
+instructions = parse_instructions(sys.argv[1])  # argv[1]: the ISA definition to parse
+
+# Emits the C that fetches each source and ORs it into the packed word of
+# every state. `offset` is added to the source index to work around a quirk of
+# our compiler IR with staging registers (TODO: reorder in the IR to fix this)
+def pack_sources(sources, body, pack_exprs, offset):
+    for idx, source in enumerate(sources):
+        shift, mask = source[0], source[1]
+        body.append('unsigned src{} = bi_get_src(ins, regs, {});'.format(idx, idx + offset))
+
+        # A mask of 0xFF has all eight low bits set, so there is nothing to check
+        if mask != 0xFF:
+            body.append('assert((1 << src{}) & {});'.format(idx, hex(mask)))
+        # Sources are state-invariant: the same field goes into every state
+        field = '(src{} << {})'.format(idx, shift)
+        for exprs in pack_exprs:
+            exprs.append(field)
+    body.append('')
+
+# Index of the source a modifier applies to: the trailing digit of its name
+# when that is one of 0-3, otherwise the first source
+def mod_arg(mod):
+    suffix = mod[-1]
+    return int(suffix) if suffix in "0123" else 0
+
+# Widen/lane/swz/swap/replicate modifiers conceptually act as a combined extend
+# + swizzle.  We look at the size of the argument to determine if we apply
+# them, and look at the swizzle to pick which one.
+
+def pack_widen(mod, opts, body, pack_exprs):
+    # Source whose type size and swizzle the emitted C inspects
+    src = mod_arg(mod)
+    body.append('unsigned {}_sz = nir_alu_type_get_type_size(ins->src_types[{}]);'.format(mod, src))
+    body.append('unsigned {}_temp = 0;'.format(mod))
+
+    # Becomes 'else ' once the first branch of the if/else-if chain is out
+    prefix = ''
+    for value, opt in enumerate(opts):
+        if opt is None or opt == 'reserved':
+            continue
+
+        if opt in ('none', 'w0'):
+            test = '{}_sz == 32'.format(mod)
+        elif opt == 'd0':
+            test = '{}_sz == 64'.format(mod)
+        else:
+            # h<lanes> is matched at 16 bits, b<lanes> at 8 bits; each
+            # character after the letter must equal that swizzle component
+            assert(opt[0] in ['h', 'b'])
+            lanes = ['ins->swizzle[{}][{}] == {}'.format(src, n, lane) for n, lane in enumerate(opt[1:])]
+            test = '{}_sz == {} && {}'.format(mod, 16 if opt[0] == 'h' else 8, " && ".join(lanes))
+
+        body.append('{}if ({}) {}_temp = {};'.format(prefix, test, mod, value))
+        prefix = 'else '
+
+    # Hit in the emitted C when none of the conditions above held
+    body.append('else unreachable("Could not pattern match widen");')
+
+    return mod + '_temp'
+
+# abs/neg are read from ins->src_{abs,neg}[src], src being the name's digit
+def pack_absneg(mod, opts, body, pack_exprs):
+    return 'ins->src_%s[%d]' % (mod if mod[-1] not in "0123" else mod[:-1], mod_arg(mod))
+
+# ins->roundmode is the native format (RTE/RTP/RTN/RTZ) for most ops. But there
+# are some others we might encounter that we don't support in the IR at this
+# point, and a few ops only encode a pair of round modes in a single bit.
+
+def pack_round(mod, opts, body, pack_exprs):
+    # All four modes available: pass the IR value straight through
+    if opts[0:4] == ['none', 'rtp', 'rtn', 'rtz']:
+        return 'ins->roundmode'
+    if opts == ['none', 'rtz']:
+        lo, hi = 'BIFROST_RTE', 'BIFROST_RTZ'
+    else:
+        assert opts == ['rtn', 'rtp']
+        lo, hi = 'BIFROST_RTN', 'BIFROST_RTP'
+    body.append('assert(ins->roundmode == {} || ins->roundmode == {});'.format(lo, hi))
+    return '(ins->roundmode == {}) ? 1 : 0'.format(hi)
+
+# The four-entry clamp field matches ins->outmod; the one-bit form only
+# distinguishes no clamp from saturate
+def pack_clamp(mod, opts, body, pack_exprs):
+    if opts == ['none', 'clamp_0_1']:
+        body.append('assert(ins->outmod == BIFROST_NONE || ins->outmod == BIFROST_SAT);')
+        return '(ins->outmod == BIFROST_SAT) ? 1 : 0'
+    if opts == ['none', 'clamp_0_inf', 'clamp_m1_1', 'clamp_0_1']:
+        return 'ins->outmod'
+
+    assert False
+
+# Our modifiers match up in name, but there is no shortage of orders. So just
+# emit a table on the fly for it, since you won't get something much better.
+# ENUM_BI_COND must be kept synced with `enum bi_cond` in compiler.h
+
+ENUM_BI_COND = ["al", "lt", "le", "ge", "gt", "eq", "ne"]
+
+def pack_cmpf(mod, opts, body, pack_exprs):
+    # One entry per ENUM_BI_COND value, in that order: the position of the
+    # condition within opts, or an invalid sentinel (~0) when this instruction
+    # has no such option (which will then be asserted out in a debug build).
+    entries = []
+    for cond in ENUM_BI_COND:
+        if cond in opts:
+            entries.append(str(opts.index(cond)))
+        else:
+            entries.append('~0')
+
+    # The table is emitted as a C array and indexed with the IR condition
+    body.extend([
+        'unsigned cmpf_table[] = {',
+        '    ' + ', '.join(entries),
+        '};',
+    ])
+    return 'cmpf_table[ins->cond]'
+
+# Since our IR is explicitly typed, we look at the size/sign of the source to
+# determine sign extension behaviour
+def pack_extend(mod, opts, body, pack_exprs):
+    src_type = 'ins->src_types[{}]'.format(mod_arg(mod))
+    body.append('ASSERTED bool {}_small = nir_alu_type_get_type_size({}) <= 16;'.format(mod, src_type))
+    body.append('bool {}_signed = nir_alu_type_get_base_type({}) == nir_type_int;'.format(mod, src_type))
+    if opts == ['none', 'sext', 'zext', 'reserved']:
+        return '{0}_small ? ({0}_signed ? 1 : 2) : 0'.format(mod)
+    # Otherwise there is no "none" to fall back on, so the source must be small
+    assert opts == ['zext', 'sext']
+    body.append('assert({}_small);'.format(mod))
+    return '{}_signed ? 1 : 0'.format(mod)
+
+# Packs special varying loads. Our BIFROST_FRAGZ etc defines match the hw in the
+# bottom two bits (TODO drop upper bits); the assert checks the option order
+def pack_varying_name(mod, opts, body, pack_exprs):
+    assert(opts[0] == 'point' and opts[2] == 'frag_w' and opts[3] == 'frag_z')
+    return 'ins->constant.u64 & 0x3'
+
+def pack_not_src1(mod, opts, body, pack_exprs):  # encodes the IR's src1_invert flag
+    return 'ins->bitwise.src1_invert ? %d : %d' % (opts.index('not'), opts.index('none'))
+
+def pack_not_result(mod, opts, body, pack_exprs):  # encodes the IR's dest_invert flag
+    return 'ins->bitwise.dest_invert ? %d : %d' % (opts.index('not'), opts.index('none'))
+
+REGISTER_FORMATS = {
+    'f64': 'nir_type_float64',
+    'f32': 'nir_type_float32',
+    'f16': 'nir_type_float16',
+    'u64': 'nir_type_uint64',
+    'u32': 'nir_type_uint32',
+    'u16': 'nir_type_uint16',
+    'i64': 'nir_type_int64',
+    's32': 'nir_type_int32',
+    's16': 'nir_type_int16'
+}
+
+def pack_register_format(mod, opts, body, pack_exprs):
+    # Emits an if/else-if chain in C mapping ins->format to the index of the
+    # matching option, and returns the name of the temporary holding it.
+    body.append('unsigned {}_temp = 0;'.format(mod))
+    first = True
+    for i, op in enumerate(opts):
+        # Reserved slots and formats without an IR type get no branch. They
+        # must not start the chain either, or C gets a dangling "else if".
+        nir_type = REGISTER_FORMATS.get(op)
+        if nir_type is None:
+            continue
+
+        t_else = 'else ' if not first else ''
+        first = False
+        body.append('{}if (ins->format == {}) {}_temp = {};'.format(t_else, nir_type, mod, i))
+    assert not first, 'no supported register format among the options'
+    body.append('else unreachable("Could not pattern match register format");')
+    return mod + '_temp'
+
+def pack_seg(mod, opts, body, pack_exprs):
+    if opts == ['none', 'wgl']:
+        body.append('assert(ins->segment == BI_SEGMENT_NONE || ins->segment == BI_SEGMENT_WLS);')
+        return 'ins->segment == BI_SEGMENT_WLS ? 1 : 0'
+
+    # Eight options: ins->segment is used as is, and must be non-zero
+    assert len(opts) == 8
+    body.append('assert(ins->segment);')
+    return 'ins->segment'
+
+# TODO: Update modes (perf / slow). For now just force store (0), except for
+# special varyings (constant >= 20), for which we force clobber (3)
+def pack_update(mod, opts, body, pack_exprs):
+    if opts != ['store', 'retrieve', 'conditional', 'clobber']:
+        # Without the full set, just check store really is encoded as 0
+        assert(opts[0] == 'store')
+        return '0'
+    return '(ins->constant.u64 >= 20) ? 3 : 0'
+
+# Maps each modifier name to a function (mod, opts, body, pack_exprs) returning
+# the C expression of its encoded value, read from the IR. Must sync with the IR.
+
+modifier_map = {
+        "widen": pack_widen,
+        "widen0": pack_widen,
+        "widen1": pack_widen,
+        "lane": pack_widen,
+        "lane0": pack_widen,
+        "lane1": pack_widen,
+        "lane2": pack_widen,
+        "lane3": pack_widen,
+        "lanes0": pack_widen,
+        "lanes1": pack_widen,
+        "lanes2": pack_widen,
+        "swz": pack_widen,
+        "swz0": pack_widen,
+        "swz1": pack_widen,
+        "swz2": pack_widen,
+        "swap0": pack_widen,
+        "swap1": pack_widen,
+        "swap2": pack_widen,
+        "replicate0": pack_widen,
+        "replicate1": pack_widen,
+
+        "abs": pack_absneg,
+        "abs0": pack_absneg,
+        "abs1": pack_absneg,
+        "abs2": pack_absneg,
+        "neg": pack_absneg,
+        "neg0": pack_absneg,
+        "neg1": pack_absneg,
+        "neg2": pack_absneg,
+
+        "extend": pack_extend,
+        "extend0": pack_extend,
+        "extend1": pack_extend,
+        "extend2": pack_extend,
+        "sign0": pack_extend,
+        "sign1": pack_extend,
+
+        "clamp": pack_clamp,
+        "round": pack_round,
+        "cmpf": pack_cmpf,
+        "varying_name": pack_varying_name,
+        "not1": pack_not_src1,
+        "not_result": pack_not_result,
+        "register_format": pack_register_format,
+        "seg": pack_seg,
+        "update": pack_update,
+
+        # Just a minus one modifier
+        "vecsize": lambda a,b,c,d: 'ins->vector_channels - 1',
+
+        # 0: compute 1: zero
+        "lod_mode": lambda a,b,c,d: '1 - ins->texture.compute_lod',
+
+        # Not much choice in the matter...
+        "divzero": lambda a,b,c,d: '0',
+        "sem": lambda a,b,c,d: '0', # IEEE 754 compliant NaN rules
+
+        # Not supported in the IR yet (TODO): each is pinned to a fixed encoding
+        "saturate": lambda a,b,c,d: '0', # clamp to min/max int
+        "mask": lambda a,b,c,d: '0', # clz(~0) = ~0
+        "result_type": lambda a,opts,c,d: str(opts.index('m1')), # #1, #1.0, ~0 for cmp
+        "special": lambda a,b,c,d: '0', # none, which source wins..
+        "offset": lambda a,b,c,d: '0', # sin/cos thing
+        "adj": lambda a,b,c,d: '0', # sin/cos thing
+        "sqrt": lambda a,b,c,d: '0', # sin/cos thing
+        "log": lambda a,b,c,d: '1', # frexpe mode -- TODO: other transcendentals for g71
+        "scale": lambda a,b,c,d: '0', # sin/cos thing
+        "precision": lambda a,b,c,d: '0', # log thing
+        "mode": lambda a,b,c,d: '0', # log thing
+        "func": lambda a,b,c,d: '0', # pow special case thing
+        "h": lambda a,b,c,d: '0', # VN_ASST1.f16
+        "l": lambda a,b,c,d: '0', # VN_ASST1.f16
+        "sample": lambda a,b,c,d: '0', # LD_VAR center
+        "function": lambda a,b,c,d: '3', # LD_VAR_FLAT none
+        "preserve_null": lambda a,b,c,d: '0', # SEG_ADD none
+        "skip": lambda a,b,c,d: '0', # texturing (no skip)
+        "bytes2": lambda a,b,c,d: '0', # NIR shifts are in bits
+        "result_word": lambda a,b,c,d: '0', # 32-bit only shifts for now (TODO)
+        "source": lambda a,b,c,d: '7', # cycle_counter for LD_GCLK
+        "lane_op": lambda a,b,c,d: '0', # CLPER none
+        "subgroup": lambda a,b,c,d: '1', # CLPER subgroup4
+        "inactive_result": lambda a,b,c,d: '0', # CLPER zero
+        "threads": lambda a,b,c,d: '0', # IMULD odd
+        "stencil": lambda a,b,c,d: '1', # ZS_EMIT stencil
+        "z": lambda a,b,c,d: '1', # ZS_EMIT z
+        "combine": lambda a,b,c,d: '0', # BRANCHC any
+        "format": lambda a,b,c,d: '1', # LEA_TEX_IMM u32
+        "test_mode": lambda a,b,c,d: '0', # JUMP_EX z
+        "stack_mode": lambda a,b,c,d: '2', # JUMP_EX none
+        "atom_opc": lambda a,b,c,d: '2', # ATOM_C aadd
+        "mux": lambda a,b,c,d: '1', # MUX int_zero
+}
+
+def pack_modifier(mod, width, default, opts, body, pack_exprs):
+    # Returns None if `mod` has no packer in modifier_map, else True after
+    # appending C to `body` that declares `unsigned <mod>` with its encoding.
+    fn = modifier_map.get(mod)
+    if fn is None:
+        return None
+
+    expr = fn(mod, opts, body, pack_exprs)
+    body.append('unsigned {} = {};'.format(mod, expr))
+
+    # Validate we don't overflow: constants are checked right here at
+    # generation time, anything else is left to an assert in the emitted C
+    try:
+        assert int(expr) < (1 << width), '{} overflows {} bits'.format(mod, width)
+    except ValueError:
+        body.append('assert({} < {});'.format(mod, (1 << width)))
+
+    body.append('')
+    return True
+
+# Compiles an S-expression (and/or/eq/neq, modifiers, `ordering`, immediates)
+# into a C boolean expression suitable to stick in an if-statement. Takes an
+# imm_map to map modifiers to immediate values, parametrized by the ctx that
+# we're looking up in (the first, non-immediate argument of the equality)
+
+SEXPR_BINARY = {"and": "&&", "or": "||", "eq": "==", "neq": "!="}
+
+def compile_s_expr(expr, imm_map, ctx):
+    # Comparisons against the pseudo-modifier `ordering` compare the src0 and
+    # src1 variables directly instead of looking up an immediate
+    ordering = [
+        (['eq', 'ordering', '#gt'], '(src0 > src1)'),
+        (['neq', 'ordering', '#lt'], '(src0 >= src1)'),
+        (['neq', 'ordering', '#gt'], '(src0 <= src1)'),
+        (['eq', 'ordering', '#lt'], '(src0 < src1)'),
+        (['eq', 'ordering', '#eq'], '(src0 == src1)'),
+    ]
+
+    if expr[0] == 'alias':
+        return compile_s_expr(expr[1], imm_map, ctx)
+    for pattern, c_expr in ordering:
+        if expr == pattern:
+            return c_expr
+
+    if not isinstance(expr, list):
+        # Leaf: "#name" is an immediate of the ctx modifier, anything else is
+        # returned unchanged
+        return str(imm_map[ctx][expr[1:]]) if expr[0] == '#' else expr
+
+    # Operator node: children are looked up in the context of the first operand
+    op = " {} ".format(SEXPR_BINARY[expr[0]])
+    return "(" + op.join(compile_s_expr(s, imm_map, expr[1]) for s in expr[1:]) + ")"
+
+# Packs a derived value. We just iterate through the possible choices and test
+# whether the encoding matches, and if so we use it.
+
+def pack_derived(pos, exprs, imm_map, body, pack_exprs):
+    var = 'derived_{}'.format(pos)
+    body.append('unsigned {} = 0;'.format(var))
+
+    # Choices left as None in the list cannot be selected; skip them
+    keyword = 'if'
+    for value, expr in enumerate(exprs):
+        if expr is None:
+            continue
+        body.append('{} {} {} = {};'.format(keyword, compile_s_expr(expr, imm_map, None), var, value))
+        keyword = 'else if'
+
+    assert keyword != 'if'
+    body.extend(['else unreachable("No pattern match at pos {}");'.format(pos), ''])
+    assert(pos is not None)
+    pack_exprs.append('({} << {})'.format(var, pos))
+
+# Table mapping immediate names in the machine to the C expression (in terms of
+# `ins`) that yields the value from the IR, or to a constant where one is forced
+
+IMMEDIATE_TABLE = {
+        'attribute_index': 'bi_get_immediate(ins, 0)',
+        'varying_index': 'bi_get_immediate(ins, 0)',
+        'index': 'bi_get_immediate(ins, 0)',
+        'image_index': 'ins->texture.texture_index',
+        'sampler_index': 'ins->texture.sampler_index',
+        'table': '63', # Bindless (flat addressing) mode for DTSEL_IMM
+
+        # Not supported in the IR (TODO): always packed as zero
+        'shift': '0',
+        'fill': '0', # WMASK
+}
+
+# Generates a routine to pack a single variant of a single instruction.
+# The template applies the needed formatting and ORs together all the
+# pack_exprs to avoid bit fields.
+#
+# Argument swapping is sensitive to the order of operations. Dependencies:
+# sources (RW), modifiers (RW), derived values (W). Hence we emit sources and
+# modifiers first, then perform a swap if necessary overwriting
+# sources/modifiers, and last calculate derived values and pack.
+
+variant_template = Template("""static inline unsigned
+pan_pack_${name}(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
+{
+${"\\n".join([("    " + x) for x in common_body])}
+% if single_state:
+% for (pack_exprs, s_body, _) in states:
+${"\\n".join(["    " + x for x in s_body + ["return {};".format( " | ".join(pack_exprs))]])}
+% endfor
+% else:
+% for i, (pack_exprs, s_body, cond) in enumerate(states):
+    ${'} else ' if i > 0 else ''}if ${cond} {
+${"\\n".join(["        " + x for x in s_body + ["return {};".format(" | ".join(pack_exprs))]])}
+% endfor
+    } else {
+        unreachable("No matching state found in ${name}");
+    }
+% endif
+}
+""")
+
+def pack_variant(opname, states):
+    # Returns the C source of the packing routine for `opname`, or None when a
+    # modifier or immediate is missing from modifier_map / IMMEDIATE_TABLE.
+
+    # Expressions to be ORed together for the final pack, an array per state
+    pack_exprs = [[hex(state[1]["exact"][1])] for state in states]
+
+    # Computations which need to be done to encode first, across states
+    common_body = []
+
+    # Map from modifier names to a map from modifier values to encoded values
+    # String -> { String -> Uint }. This can be shared across states since
+    # modifiers are (except the pos values) constant across state.
+    imm_map = {}
+
+    # Pack sources. Offset over to deal with staging/immediate weirdness in our
+    # IR (TODO: reorder sources upstream so this goes away). Note sources are
+    # constant across states.
+    staging = states[0][1].get("staging", "")
+    offset = 0
+    if staging in ["r", "rw"]:
+        offset += 1
+
+    offset += len(set(["attribute_index", "varying_index", "index"]) & set([x[0] for x in states[0][1].get("immediates", [])]))
+
+    if opname == '+LD_VAR_SPECIAL':
+        offset += 1
+
+    pack_sources(states[0][1].get("srcs", []), common_body, pack_exprs, offset)
+
+    modifiers_handled = []
+    for st in states:
+        for ((mod, _, width), default, opts) in st[1].get("modifiers", []):
+            if mod in modifiers_handled:
+                continue
+            modifiers_handled.append(mod)
+
+            if pack_modifier(mod, width, default, opts, common_body, pack_exprs) is None:
+                return None
+
+            imm_map[mod] = { x: y for y, x in enumerate(opts) }
+
+    for i, st in enumerate(states):
+        for ((mod, pos, width), default, opts) in st[1].get("modifiers", []):
+            if pos is not None:
+                pack_exprs[i].append('({} << {})'.format(mod, pos))
+
+    # Swaps and immediates are taken from the last state. Name it instead of
+    # leaning on the loop variable above still being bound.
+    last = states[-1][1]
+
+    for ((src_a, src_b), cond, remap) in last.get("swaps", []):
+        # Figure out which vars to swap, in order to swap the arguments. This
+        # always includes the sources themselves, and may include source
+        # modifiers (with the same source indices). We swap based on which
+        # matches A, this is arbitrary but if we swapped both nothing would end
+        # up swapping at all since it would swap back.
+
+        vars_to_swap = ['src']
+        for ((mod, _, width), default, opts) in last.get("modifiers", []):
+            if mod[-1] in str(src_a):
+                vars_to_swap.append(mod[0:-1])
+
+        common_body.append('if {}'.format(compile_s_expr(cond, imm_map, None)) + ' {')
+
+        # Emit the swaps. We use a temp, and wrap in a block to avoid naming
+        # collisions with multiple swaps. {{Doubling}} to escape the format.
+        for v in vars_to_swap:
+            common_body.append('    {{ unsigned temp = {}{}; {}{} = {}{}; {}{} = temp; }}'.format(v, src_a, v, src_a, v, src_b, v, src_b))
+
+        # Also, remap. Bidirectional swaps are explicit in the XML.
+        for v in remap:
+            maps = remap[v]
+            imm = imm_map[v]
+            for n, l in enumerate(maps):
+                common_body.append('    {}if ({} == {}) {} = {};'.format('' if n == 0 else 'else ', v, imm[l], v, imm[maps[l]]))
+
+        common_body.append('}')
+        common_body.append('')
+
+    for (name, pos, width) in last.get("immediates", []):
+        if name not in IMMEDIATE_TABLE:
+            return None
+
+        common_body.append('unsigned {} = {};'.format(name, IMMEDIATE_TABLE[name]))
+        for exprs in pack_exprs:
+            exprs.append('({} << {})'.format(name, pos))
+
+    if staging == 'r':
+        common_body.append('bi_read_data_register(clause, ins);')
+    elif staging == 'w':
+        common_body.append('bi_write_data_register(clause, ins);')
+    elif staging == '':
+        pass
+    else:
+        assert staging == 'rw'
+        # XXX: register allocation requirement (!)
+        common_body.append('bi_read_data_register(clause, ins);')
+        common_body.append('assert(ins->src[0] == ins->dest);')
+
+    # After this, we have to branch off, since deriveds *do* vary based on state.
+    state_body = [[] for s in states]
+
+    for i, (_, st) in enumerate(states):
+        for ((pos, width), exprs) in st.get("derived", []):
+            pack_derived(pos, exprs, imm_map, state_body[i], pack_exprs[i])
+
+    # How do we pick a state? Accumulate the conditions (none for a lone state)
+    state_conds = [compile_s_expr(st[0], imm_map, None) for st in states] if len(states) > 1 else [None]
+
+    # Finally, we'll collect everything together
+    return variant_template.render(name = opname_to_c(opname), states = zip(pack_exprs, state_body, state_conds), common_body = common_body, single_state = (len(states) == 1))
+
+HEADER = """/*
+ * Copyright (C) 2020 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Autogenerated file, do not edit */
+
+#ifndef _BI_GENERATED_PACK_H
+#define _BI_GENERATED_PACK_H
+
+#include "compiler.h"
+#include "bi_pack_helpers.h"
+"""
+
+print(HEADER)
+
+# Instructions pack_variant cannot handle yet come back as None: skip them
+packs = [pack_variant(e, instructions[e]) for e in instructions]
+for p in (p for p in packs if p is not None):
+    print(p)
+print("#endif")