glthread: generate packed versions of gl*Pointer/Offset calls

The pointer/offset parameter is often NULL or a small number with VBOs.

The idea is:
- If the pointer/offset parameter is NULL/0, use a different cmd structure
  and unmarshal function that doesn't contain the pointer/offset parameter
  to save 8 bytes per call.
- If the cmd structure has a hole and the pointer/offset parameter is
  a small number that fits into the hole, use a different cmd structure
  and unmarshal function that stores the value within the hole using
  a smaller type to save 8 bytes per call.

This implements those ideas. It will continue generating optimal code
even if we change the packing of other parameters.

This decreases the size of 1 frame in glthread batches by 21%
in Viewperf2020/Catia1.

Example of generated code for glVertexPointer with and without the pointer
parameter if it's NULL. See the arrows for comments.

/* VertexPointer: marshalled asynchronously */
struct marshal_cmd_VertexPointer
{
   struct marshal_cmd_base cmd_base;
   GLpacked16i size;
   GLenum16 type;
   GLclamped16i stride;
   const GLvoid * pointer;
};

struct marshal_cmd_VertexPointer_packed
{
   struct marshal_cmd_base cmd_base;
   GLpacked16i size;
   GLenum16 type;
   GLclamped16i stride;
   // <------- no "pointer"
};

uint32_t
_mesa_unmarshal_VertexPointer(struct gl_context *ctx,
                              const struct marshal_cmd_VertexPointer *restrict cmd)
{
   GLpacked16i size = cmd->size;
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = cmd->pointer;
   CALL_VertexPointer(ctx->Dispatch.Current, (size, type, stride, pointer));
   return align(sizeof(struct marshal_cmd_VertexPointer), 8) / 8;
}

uint32_t
_mesa_unmarshal_VertexPointer_packed(struct gl_context *ctx,
                                     const struct marshal_cmd_VertexPointer_packed *restrict cmd)
{
   GLpacked16i size = cmd->size;
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = (const GLvoid *)(uintptr_t)0; // <------- using NULL
   CALL_VertexPointer(ctx->Dispatch.Current, (size, type, stride, pointer));
   return align(sizeof(struct marshal_cmd_VertexPointer_packed), 8) / 8;
}

static void GLAPIENTRY
_mesa_marshal_VertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer)
{
   GET_CURRENT_CONTEXT(ctx);
   if (!pointer) { // <------- the condition
      int cmd_size = sizeof(struct marshal_cmd_VertexPointer_packed);
      struct marshal_cmd_VertexPointer_packed *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_VertexPointer_packed, cmd_size);
      cmd->size = size < 0 ? UINT16_MAX : MIN2(size, UINT16_MAX);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
   } else {
      int cmd_size = sizeof(struct marshal_cmd_VertexPointer);
      struct marshal_cmd_VertexPointer *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_VertexPointer, cmd_size);
      cmd->size = size < 0 ? UINT16_MAX : MIN2(size, UINT16_MAX);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
      cmd->pointer = pointer;
   }
   _mesa_glthread_AttribPointer(ctx, VERT_ATTRIB_POS, MESA_PACK_VFORMAT(type, size, 0, 0, 0), stride, pointer);
}

Example of generated code for glNormalPointer using a smaller type:

/* NormalPointer: marshalled asynchronously */
struct marshal_cmd_NormalPointer
{
   struct marshal_cmd_base cmd_base;
   GLenum16 type;
   GLclamped16i stride;
   const GLvoid * pointer;
};

struct marshal_cmd_NormalPointer_packed
{
   struct marshal_cmd_base cmd_base;
   GLenum16 type;
   GLclamped16i stride;
   GLushort pointer; // <-------- truncated "pointer"
};

uint32_t
_mesa_unmarshal_NormalPointer(struct gl_context *ctx,
                              const struct marshal_cmd_NormalPointer *restrict cmd)
{
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = cmd->pointer;
   CALL_NormalPointer(ctx->Dispatch.Current, (type, stride, pointer));
   return align(sizeof(struct marshal_cmd_NormalPointer), 8) / 8;
}

uint32_t
_mesa_unmarshal_NormalPointer_packed(struct gl_context *ctx,
                                     const struct marshal_cmd_NormalPointer_packed *restrict cmd)
{
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = (const GLvoid *)(uintptr_t)cmd->pointer; // <-------- upcasting
   CALL_NormalPointer(ctx->Dispatch.Current, (type, stride, pointer));
   return align(sizeof(struct marshal_cmd_NormalPointer_packed), 8) / 8;
}

static void GLAPIENTRY
_mesa_marshal_NormalPointer(GLenum type, GLsizei stride, const GLvoid *pointer)
{
   GET_CURRENT_CONTEXT(ctx);
   if (((uintptr_t)pointer & 0xffff) == (uintptr_t)pointer) { // <-------- the condition
      int cmd_size = sizeof(struct marshal_cmd_NormalPointer_packed);
      struct marshal_cmd_NormalPointer_packed *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_NormalPointer_packed, cmd_size);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
      cmd->pointer = (uintptr_t)pointer; /* truncated */ // <-------- the truncation
   } else {
      int cmd_size = sizeof(struct marshal_cmd_NormalPointer);
      struct marshal_cmd_NormalPointer *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_NormalPointer, cmd_size);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
      cmd->pointer = pointer;
   }
   _mesa_glthread_AttribPointer(ctx, VERT_ATTRIB_NORMAL, MESA_PACK_VFORMAT(type, 3, 1, 0, 0), stride, pointer);
}

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27350>
2026-05-05 13:58:04 +02:00 · 2024-01-08 20:59:37 -05:00 · 2024-01-08 20:59:37 -05:00 · c9abb7ff6e
commit c9abb7ff6e
parent 24f14f8daa
4 changed files with 141 additions and 23 deletions
--- a/src/mapi/glapi/gen/gl_marshal.py
+++ b/src/mapi/glapi/gen/gl_marshal.py
@@ -103,19 +103,23 @@ class PrintCode(gl_XML.gl_print_base):
        out('')
        out('')

-    def print_unmarshal_func(self, func):
-        out('uint32_t')
-        out(('_mesa_unmarshal_{0}(struct gl_context *ctx, '
-             'const struct marshal_cmd_{0} *restrict cmd)').format(func.name))
+    def print_unmarshal_func(self, func, is_packed=False):
+        func.print_unmarshal_prototype(is_packed=is_packed)
        out('{')
        with indent():
            for p in func.fixed_params:
+                type = func.get_marshal_type(p)
+
                if p.count:
                    p_decl = '{0} *{1} = cmd->{1};'.format(
                            p.get_base_type_string(), p.name)
+                elif is_packed and func.packed_param_name == p.name:
+                    if func.packed_param_size == 0:
+                        p_decl = '{0} {1} = ({0})(uintptr_t)0;'.format(type, p.name)
+                    else:
+                        p_decl = '{0} {1} = ({0})(uintptr_t)cmd->{1};'.format(type, p.name)
                else:
-                    p_decl = '{0} {1} = cmd->{1};'.format(
-                            func.get_marshal_type(p), p.name)
+                    p_decl = '{0} {1} = cmd->{1};'.format(type, p.name)

                if not p_decl.startswith('const ') and p.count:
                    # Declare all local function variables as const, even if
@@ -150,12 +154,14 @@ class PrintCode(gl_XML.gl_print_base):
            if func.variable_params:
                out('return cmd->num_slots;')
            else:
-                struct = 'struct marshal_cmd_{0}'.format(func.name)
-                out('return align(sizeof({0}), 8) / 8;'.format(struct))
+                out('return align(sizeof({0}), 8) / 8;'.format(func.get_marshal_struct_name(is_packed)))
        out('}')

-    def print_marshal_async_code(self, func):
-        struct = 'struct marshal_cmd_{0}'.format(func.name)
+        if not is_packed and func.packed_fixed_params:
+            self.print_unmarshal_func(func, is_packed=True)
+
+    def print_marshal_async_code(self, func, is_packed=False):
+        struct = func.get_marshal_struct_name(is_packed)

        if func.marshal_sync:
            out('int cmd_size = sizeof({0});'.format(struct))
@@ -205,17 +211,24 @@ class PrintCode(gl_XML.gl_print_base):
                out('assert(cmd_size >= 0 && cmd_size <= MARSHAL_MAX_CMD_SIZE);')

        # Add the call into the batch.
-        out('{0} *cmd = _mesa_glthread_allocate_command(ctx, '
-            'DISPATCH_CMD_{1}, cmd_size);'.format(struct, func.name))
+        dispatch_cmd = 'DISPATCH_CMD_{0}{1}'.format(func.name, '_packed' if is_packed else '')
+        if func.get_fixed_params(is_packed) or func.variable_params:
+            out('{0} *cmd = _mesa_glthread_allocate_command(ctx, {1}, cmd_size);'
+                .format(struct, dispatch_cmd))
+        else:
+            out('_mesa_glthread_allocate_command(ctx, {0}, cmd_size);'.format(dispatch_cmd))
+
        if func.variable_params:
            out('cmd->num_slots = align(cmd_size, 8) / 8;')

-        for p in func.fixed_params:
+        for p in func.get_fixed_params(is_packed):
            type = func.get_marshal_type(p)

            if p.count:
                out('memcpy(cmd->{0}, {0}, {1});'.format(
                        p.name, p.size_string()))
+            elif is_packed and p.name == func.packed_param_name:
+                out('cmd->{0} = (uintptr_t){0}; /* truncated */'.format(p.name))
            elif type == 'GLenum8':
                out('cmd->{0} = MIN2({0}, 0xff); /* clamped to 0xff (invalid enum) */'.format(p.name))
            elif type == 'GLenum16':
@@ -226,6 +239,7 @@ class PrintCode(gl_XML.gl_print_base):
                out('cmd->{0} = {0} < 0 ? UINT16_MAX : MIN2({0}, UINT16_MAX);'.format(p.name))
            else:
                out('cmd->{0} = {0};'.format(p.name))
+
        if func.variable_params:
            out('char *variable_data = (char *) (cmd + 1);')
            i = 1
@@ -244,9 +258,6 @@ class PrintCode(gl_XML.gl_print_base):
                        out('variable_data += {0}_size;'.format(p.name))
                i += 1

-        if not func.fixed_params and not func.variable_params:
-            out('(void) cmd;')
-
    def print_async_body(self, func):
        out('/* {0}: marshalled asynchronously */'.format(func.name))
        func.print_struct()
@@ -261,7 +272,22 @@ class PrintCode(gl_XML.gl_print_base):
            if func.marshal_call_before:
                out(func.marshal_call_before);

-            self.print_marshal_async_code(func)
+            if func.packed_fixed_params:
+                if func.packed_param_size > 0:
+                    out('if (((uintptr_t){0} & 0x{1}) == (uintptr_t){0}) {{'
+                        .format(func.packed_param_name,
+                                'ff' * func.packed_param_size))
+                else:
+                    out('if (!{0}) {{'.format(func.packed_param_name))
+
+                with indent():
+                    self.print_marshal_async_code(func, is_packed=True)
+                out('} else {')
+                with indent():
+                    self.print_marshal_async_code(func)
+                out('}')
+            else:
+                self.print_marshal_async_code(func)

            if func.marshal_call_after:
                out(func.marshal_call_after)
--- a/src/mapi/glapi/gen/gl_marshal_h.py
+++ b/src/mapi/glapi/gen/gl_marshal_h.py
@@ -61,6 +61,8 @@ class PrintCode(gl_XML.gl_print_base):
            if flavor in ('skip', 'sync'):
                continue
            print('   DISPATCH_CMD_{0},'.format(func.name))
+            if func.packed_fixed_params:
+                print('   DISPATCH_CMD_{0}_packed,'.format(func.name))
        print('   NUM_DISPATCH_CMD,')
        print('};')
        print('')
@@ -71,8 +73,9 @@ class PrintCode(gl_XML.gl_print_base):
            flavor = func.marshal_flavor()

            if flavor in ('custom', 'async'):
-                print(('uint32_t _mesa_unmarshal_{0}(struct gl_context *ctx, '
-                       'const struct marshal_cmd_{0} *restrict cmd);').format(func.name))
+                func.print_unmarshal_prototype(suffix=';')
+                if func.packed_fixed_params:
+                    func.print_unmarshal_prototype(suffix=';', is_packed=True)

            if flavor in ('custom', 'async', 'sync') and not func.marshal_is_static():
                print('{0} GLAPIENTRY _mesa_marshal_{1}({2});'.format(func.return_type, func.name, func.get_parameter_string()))
--- a/src/mapi/glapi/gen/gl_unmarshal_table.py
+++ b/src/mapi/glapi/gen/gl_unmarshal_table.py
@@ -68,6 +68,8 @@ class PrintCode(gl_XML.gl_print_base):
                if func.marshal_flavor() in ('skip', 'sync'):
                    continue
                out('[DISPATCH_CMD_{0}] = (_mesa_unmarshal_func)_mesa_unmarshal_{0},'.format(func.name))
+                if func.packed_fixed_params:
+                    out('[DISPATCH_CMD_{0}_packed] = (_mesa_unmarshal_func)_mesa_unmarshal_{0}_packed,'.format(func.name))
        out('};')

        # Print the string table of function names.
@@ -78,6 +80,8 @@ class PrintCode(gl_XML.gl_print_base):
                if func.marshal_flavor() in ('skip', 'sync'):
                    continue
                out('[DISPATCH_CMD_{0}] = "{0}",'.format(func.name))
+                if func.packed_fixed_params:
+                    out('[DISPATCH_CMD_{0}_packed] = "{0}_packed",'.format(func.name))
        out('};')


--- a/src/mapi/glapi/gen/marshal_XML.py
+++ b/src/mapi/glapi/gen/marshal_XML.py
@@ -25,6 +25,11 @@

 import gl_XML
 import sys
+import copy
+import typeexpr
+
+def pot_align(base, pot_alignment):
+    return (base + pot_alignment - 1) & ~(pot_alignment - 1);


 class marshal_item_factory(gl_XML.gl_item_factory):
@@ -163,6 +168,74 @@ class marshal_function(gl_XML.gl_function):
        # from smallest to biggest.
        self.fixed_params = sorted(self.fixed_params, key=lambda p: self.get_type_size(p))

+        # Compute the marshal structure size and the largest hole
+        self.struct_size = 2 # sizeof(struct marshal_cmd_base)
+        largest_hole = 0
+
+        for p in self.fixed_params:
+            type_size = self.get_type_size(p)
+            aligned_size = pot_align(self.struct_size, type_size)
+            largest_hole = max(aligned_size - self.struct_size, largest_hole)
+            self.struct_size = aligned_size
+            self.struct_size = self.struct_size + type_size
+
+        # Round down largest_hole to a power of two.
+        largest_hole = int(2 ** (largest_hole.bit_length() - 1))
+
+        # Align the structure to 8 bytes.
+        aligned_size = pot_align(self.struct_size, 8)
+        padding_hole = aligned_size - self.struct_size
+        self.struct_size = aligned_size
+
+        # Determine whether to generate a packed version of gl*Pointer calls.
+        # If there is a hole in the cmd structure, the pointer/offset parameter
+        # can be truncated and stored in the hole to save 8 bytes per call.
+        # The version of the structure is determined at runtime based on
+        # whether the truncation doesn't change the value. This is common with
+        # VBOs because the pointer/offset is usually small.
+        #
+        # If there is no hole, the packed version completely removes
+        # the pointer/offset parameter and is used when the value is NULL/0
+        # to remove 8 bytes per call. This is common with VBOs.
+        self.packed_param_name = None
+
+        if (self.is_vertex_pointer_call and
+            # 32-bit CPUs only benefit if we remove the whole 8-byte slot,
+            # which means there must be exactly 4-byte padding after the 4-byte
+            # pointer/offset parameter.
+            (self.context.pointer_size != 4 or padding_hole == 4)):
+            for pname in ['pointer', 'offset']:
+                if pname in [p.name for p in self.fixed_params]:
+                    self.packed_param_name = pname
+
+            assert self.packed_param_name
+            assert not self.variable_params
+            assert not self.marshal_sync
+
+        # Prepare the parameters for the packed version by replacing the type
+        # of the packed variable or removing it completely.
+        self.packed_fixed_params = []
+        if self.packed_param_name:
+            for p in self.fixed_params:
+                if p.name == self.packed_param_name:
+                    if largest_hole > 0:
+                        # Select the truncated type.
+                        type = ['GLubyte', 'GLushort', 'GLuint'][largest_hole.bit_length() - 1]
+
+                        # Clone the parameter and change its type
+                        new_param = copy.deepcopy(p)
+                        new_param.type_expr = typeexpr.type_expression(type, self.context)
+                        self.packed_fixed_params.append(new_param)
+                else:
+                    self.packed_fixed_params.append(p)
+            self.packed_param_size = largest_hole
+        # Sort the parameters by size to move the truncated type into the hole.
+        self.packed_fixed_params = sorted(self.packed_fixed_params, key=lambda p: self.get_type_size(p))
+
+
+    def get_fixed_params(self, is_packed):
+        return self.packed_fixed_params if is_packed else self.fixed_params
+
    def marshal_flavor(self):
        """Find out how this function should be marshalled between
        client and server threads."""
@@ -195,15 +268,15 @@ class marshal_function(gl_XML.gl_function):
                self.name[0:8] != 'Internal' and
                self.exec_flavor != 'beginend')

-    def print_struct(self, is_header=False):
+    def print_struct(self, is_header=False, is_packed=False):
        if (self.marshal_struct == 'public') == is_header:
-            print('struct marshal_cmd_{0}'.format(self.name))
+            print(self.get_marshal_struct_name(is_packed))
            print('{')
            print('   struct marshal_cmd_base cmd_base;')
            if self.variable_params:
                print('   uint16_t num_slots;')

-            for p in self.fixed_params:
+            for p in self.get_fixed_params(is_packed):
                if p.count:
                    print('   {0} {1}[{2}];'.format(
                            p.get_base_type_string(), p.name, p.count))
@@ -228,4 +301,16 @@ class marshal_function(gl_XML.gl_function):
                            p.name, p.counter))
            print('};')
        elif self.marshal_flavor() in ('custom', 'async'):
-            print('struct marshal_cmd_{0};'.format(self.name))
+            print('{0};'.format(self.get_marshal_struct_name(is_packed)))
+
+        if not is_packed and self.packed_fixed_params:
+            self.print_struct(is_header, True)
+
+    def get_marshal_struct_name(self, is_packed=False):
+        return 'struct marshal_cmd_{0}{1}'.format(self.name, '_packed' if is_packed else '')
+
+    def print_unmarshal_prototype(self, is_packed=False, suffix=''):
+        print(('uint32_t _mesa_unmarshal_{0}{1}(struct gl_context *ctx, '
+               'const {2} *restrict cmd){3}')
+               .format(self.name, '_packed' if is_packed else '',
+                       self.get_marshal_struct_name(is_packed), suffix))