glthread: generate packed versions of gl*Pointer/Offset calls

The pointer/offset parameter is often NULL or a small number with VBOs.

The idea is:
- If the pointer/offset parameter is NULL/0, use a different cmd structure
  and unmarshal function that doesn't contain the pointer/offset parameter
  to save 8 bytes per call.
- If the cmd structure has a hole and the pointer/offset parameter is
  a small number that fits into the hole, use a different cmd structure
  and unmarshal function that stores the value within the hole using
  a smaller type to save 8 bytes per call.

This implements those ideas. The generator will keep producing optimal
code even if we change the packing of other parameters.

This decreases the size of the glthread batches for 1 frame by 21%
in Viewperf2020/Catia1.

Example of generated code for glVertexPointer: the regular version, and the
packed version that omits the pointer parameter and is used when the pointer
is NULL. The arrows mark the lines of interest.

/* VertexPointer: marshalled asynchronously */
/* Full command layout: the 16-bit size/type/stride fields followed by the
 * complete pointer value. _mesa_marshal_VertexPointer selects this layout
 * when "pointer" is non-NULL.
 */
struct marshal_cmd_VertexPointer
{
   struct marshal_cmd_base cmd_base;
   GLpacked16i size;
   GLenum16 type;
   GLclamped16i stride;
   const GLvoid * pointer;
};
/* Packed command layout: identical to marshal_cmd_VertexPointer except that
 * the pointer field is omitted. _mesa_marshal_VertexPointer selects this
 * layout when "pointer" is NULL, and the packed unmarshal function passes 0.
 */
struct marshal_cmd_VertexPointer_packed
{
   struct marshal_cmd_base cmd_base;
   GLpacked16i size;
   GLenum16 type;
   GLclamped16i stride;                                 // <------- no "pointer"
};
/* Unmarshal function for the full layout: reads every parameter, including
 * the pointer, out of the command and forwards them to CALL_VertexPointer on
 * ctx->Dispatch.Current.
 *
 * Returns align(sizeof(struct marshal_cmd_VertexPointer), 8) / 8, i.e. the
 * command size in 8-byte units (assuming align() rounds up).
 */
uint32_t _mesa_unmarshal_VertexPointer(struct gl_context *ctx, const struct marshal_cmd_VertexPointer *restrict cmd)
{
   GLpacked16i size = cmd->size;
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = cmd->pointer;
   CALL_VertexPointer(ctx->Dispatch.Current, (size, type, stride, pointer));
   return align(sizeof(struct marshal_cmd_VertexPointer), 8) / 8;
}
/* Unmarshal function for the packed layout: the command carries no pointer
 * field, so a constant 0 (NULL) pointer is passed to CALL_VertexPointer
 * together with the stored size/type/stride.
 *
 * Returns align(sizeof(struct marshal_cmd_VertexPointer_packed), 8) / 8, i.e.
 * the packed command size in 8-byte units (assuming align() rounds up).
 */
uint32_t _mesa_unmarshal_VertexPointer_packed(struct gl_context *ctx, const struct marshal_cmd_VertexPointer_packed *restrict cmd)
{
   GLpacked16i size = cmd->size;
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = (const GLvoid *)(uintptr_t)0;       // <------- using NULL
   CALL_VertexPointer(ctx->Dispatch.Current, (size, type, stride, pointer));
   return align(sizeof(struct marshal_cmd_VertexPointer_packed), 8) / 8;
}
/* Marshal entry point for glVertexPointer.
 *
 * Allocates the packed command when "pointer" is NULL and the full command
 * otherwise. Both branches store size/type/stride with the same clamping;
 * only the full command additionally stores the pointer. Afterwards
 * _mesa_glthread_AttribPointer is called with the original, unclamped
 * arguments.
 */
static void GLAPIENTRY
_mesa_marshal_VertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer)
{
   GET_CURRENT_CONTEXT(ctx);
   if (!pointer) {                              // <------- the condition
      int cmd_size = sizeof(struct marshal_cmd_VertexPointer_packed);
      struct marshal_cmd_VertexPointer_packed *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_VertexPointer_packed, cmd_size);
      /* Negative sizes are stored as UINT16_MAX; larger values are capped at UINT16_MAX. */
      cmd->size = size < 0 ? UINT16_MAX : MIN2(size, UINT16_MAX);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
   } else {
      int cmd_size = sizeof(struct marshal_cmd_VertexPointer);
      struct marshal_cmd_VertexPointer *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_VertexPointer, cmd_size);
      /* Negative sizes are stored as UINT16_MAX; larger values are capped at UINT16_MAX. */
      cmd->size = size < 0 ? UINT16_MAX : MIN2(size, UINT16_MAX);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
      cmd->pointer = pointer;
   }
   _mesa_glthread_AttribPointer(ctx, VERT_ATTRIB_POS, MESA_PACK_VFORMAT(type, size, 0, 0, 0), stride, pointer);
}

Example of generated code for glNormalPointer using a smaller type:

/* NormalPointer: marshalled asynchronously */
/* Full command layout: 16-bit type and stride followed by the complete
 * pointer value. _mesa_marshal_NormalPointer selects this layout when the
 * pointer value does not fit in 16 bits.
 */
struct marshal_cmd_NormalPointer
{
   struct marshal_cmd_base cmd_base;
   GLenum16 type;
   GLclamped16i stride;
   const GLvoid * pointer;
};
/* Packed command layout: the pointer is stored as a GLushort instead of a
 * full pointer. _mesa_marshal_NormalPointer selects this layout only when
 * the pointer value equals its own low 16 bits, so no information is lost.
 */
struct marshal_cmd_NormalPointer_packed
{
   struct marshal_cmd_base cmd_base;
   GLenum16 type;
   GLclamped16i stride;
   GLushort pointer;                                    // <-------- truncated "pointer"
};
/* Unmarshal function for the full layout: reads type, stride and the full
 * pointer out of the command and forwards them to CALL_NormalPointer on
 * ctx->Dispatch.Current.
 *
 * Returns align(sizeof(struct marshal_cmd_NormalPointer), 8) / 8, i.e. the
 * command size in 8-byte units (assuming align() rounds up).
 */
uint32_t _mesa_unmarshal_NormalPointer(struct gl_context *ctx, const struct marshal_cmd_NormalPointer *restrict cmd)
{
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = cmd->pointer;
   CALL_NormalPointer(ctx->Dispatch.Current, (type, stride, pointer));
   return align(sizeof(struct marshal_cmd_NormalPointer), 8) / 8;
}
/* Unmarshal function for the packed layout: widens the stored GLushort back
 * to a pointer via uintptr_t and forwards it, with type and stride, to
 * CALL_NormalPointer on ctx->Dispatch.Current.
 *
 * Returns align(sizeof(struct marshal_cmd_NormalPointer_packed), 8) / 8, i.e.
 * the packed command size in 8-byte units (assuming align() rounds up).
 */
uint32_t _mesa_unmarshal_NormalPointer_packed(struct gl_context *ctx, const struct marshal_cmd_NormalPointer_packed *restrict cmd)
{
   GLenum16 type = cmd->type;
   GLclamped16i stride = cmd->stride;
   const GLvoid * pointer = (const GLvoid *)(uintptr_t)cmd->pointer;  // <-------- upcasting
   CALL_NormalPointer(ctx->Dispatch.Current, (type, stride, pointer));
   return align(sizeof(struct marshal_cmd_NormalPointer_packed), 8) / 8;
}
/* Marshal entry point for glNormalPointer.
 *
 * Allocates the packed command when the pointer value is unchanged by masking
 * with 0xffff (i.e. it fits in 16 bits) and the full command otherwise. Both
 * branches store type and stride with the same clamping. Afterwards
 * _mesa_glthread_AttribPointer is called with the original, unclamped
 * arguments.
 */
static void GLAPIENTRY
_mesa_marshal_NormalPointer(GLenum type, GLsizei stride, const GLvoid *pointer)
{
   GET_CURRENT_CONTEXT(ctx);
   if (((uintptr_t)pointer & 0xffff) == (uintptr_t)pointer) {        // <-------- the condition
      int cmd_size = sizeof(struct marshal_cmd_NormalPointer_packed);
      struct marshal_cmd_NormalPointer_packed *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_NormalPointer_packed, cmd_size);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
      /* Lossless here: the branch condition guarantees the value fits in the GLushort field. */
      cmd->pointer = (uintptr_t)pointer; /* truncated */             // <-------- the truncation
   } else {
      int cmd_size = sizeof(struct marshal_cmd_NormalPointer);
      struct marshal_cmd_NormalPointer *cmd = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_NormalPointer, cmd_size);
      cmd->type = MIN2(type, 0xffff); /* clamped to 0xffff (invalid enum) */
      cmd->stride = CLAMP(stride, INT16_MIN, INT16_MAX);
      cmd->pointer = pointer;
   }
   _mesa_glthread_AttribPointer(ctx, VERT_ATTRIB_NORMAL, MESA_PACK_VFORMAT(type, 3, 1, 0, 0), stride, pointer);
}

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27350>
This commit is contained in:
Marek Olšák 2024-01-08 20:59:37 -05:00 committed by Marge Bot
parent 24f14f8daa
commit c9abb7ff6e
4 changed files with 141 additions and 23 deletions

View file

@ -103,19 +103,23 @@ class PrintCode(gl_XML.gl_print_base):
out('')
out('')
def print_unmarshal_func(self, func):
out('uint32_t')
out(('_mesa_unmarshal_{0}(struct gl_context *ctx, '
'const struct marshal_cmd_{0} *restrict cmd)').format(func.name))
def print_unmarshal_func(self, func, is_packed=False):
func.print_unmarshal_prototype(is_packed=is_packed)
out('{')
with indent():
for p in func.fixed_params:
type = func.get_marshal_type(p)
if p.count:
p_decl = '{0} *{1} = cmd->{1};'.format(
p.get_base_type_string(), p.name)
elif is_packed and func.packed_param_name == p.name:
if func.packed_param_size == 0:
p_decl = '{0} {1} = ({0})(uintptr_t)0;'.format(type, p.name)
else:
p_decl = '{0} {1} = ({0})(uintptr_t)cmd->{1};'.format(type, p.name)
else:
p_decl = '{0} {1} = cmd->{1};'.format(
func.get_marshal_type(p), p.name)
p_decl = '{0} {1} = cmd->{1};'.format(type, p.name)
if not p_decl.startswith('const ') and p.count:
# Declare all local function variables as const, even if
@ -150,12 +154,14 @@ class PrintCode(gl_XML.gl_print_base):
if func.variable_params:
out('return cmd->num_slots;')
else:
struct = 'struct marshal_cmd_{0}'.format(func.name)
out('return align(sizeof({0}), 8) / 8;'.format(struct))
out('return align(sizeof({0}), 8) / 8;'.format(func.get_marshal_struct_name(is_packed)))
out('}')
def print_marshal_async_code(self, func):
struct = 'struct marshal_cmd_{0}'.format(func.name)
if not is_packed and func.packed_fixed_params:
self.print_unmarshal_func(func, is_packed=True)
def print_marshal_async_code(self, func, is_packed=False):
struct = func.get_marshal_struct_name(is_packed)
if func.marshal_sync:
out('int cmd_size = sizeof({0});'.format(struct))
@ -205,17 +211,24 @@ class PrintCode(gl_XML.gl_print_base):
out('assert(cmd_size >= 0 && cmd_size <= MARSHAL_MAX_CMD_SIZE);')
# Add the call into the batch.
out('{0} *cmd = _mesa_glthread_allocate_command(ctx, '
'DISPATCH_CMD_{1}, cmd_size);'.format(struct, func.name))
dispatch_cmd = 'DISPATCH_CMD_{0}{1}'.format(func.name, '_packed' if is_packed else '')
if func.get_fixed_params(is_packed) or func.variable_params:
out('{0} *cmd = _mesa_glthread_allocate_command(ctx, {1}, cmd_size);'
.format(struct, dispatch_cmd))
else:
out('_mesa_glthread_allocate_command(ctx, {0}, cmd_size);'.format(dispatch_cmd))
if func.variable_params:
out('cmd->num_slots = align(cmd_size, 8) / 8;')
for p in func.fixed_params:
for p in func.get_fixed_params(is_packed):
type = func.get_marshal_type(p)
if p.count:
out('memcpy(cmd->{0}, {0}, {1});'.format(
p.name, p.size_string()))
elif is_packed and p.name == func.packed_param_name:
out('cmd->{0} = (uintptr_t){0}; /* truncated */'.format(p.name))
elif type == 'GLenum8':
out('cmd->{0} = MIN2({0}, 0xff); /* clamped to 0xff (invalid enum) */'.format(p.name))
elif type == 'GLenum16':
@ -226,6 +239,7 @@ class PrintCode(gl_XML.gl_print_base):
out('cmd->{0} = {0} < 0 ? UINT16_MAX : MIN2({0}, UINT16_MAX);'.format(p.name))
else:
out('cmd->{0} = {0};'.format(p.name))
if func.variable_params:
out('char *variable_data = (char *) (cmd + 1);')
i = 1
@ -244,9 +258,6 @@ class PrintCode(gl_XML.gl_print_base):
out('variable_data += {0}_size;'.format(p.name))
i += 1
if not func.fixed_params and not func.variable_params:
out('(void) cmd;')
def print_async_body(self, func):
out('/* {0}: marshalled asynchronously */'.format(func.name))
func.print_struct()
@ -261,7 +272,22 @@ class PrintCode(gl_XML.gl_print_base):
if func.marshal_call_before:
out(func.marshal_call_before);
self.print_marshal_async_code(func)
if func.packed_fixed_params:
if func.packed_param_size > 0:
out('if (((uintptr_t){0} & 0x{1}) == (uintptr_t){0}) {{'
.format(func.packed_param_name,
'ff' * func.packed_param_size))
else:
out('if (!{0}) {{'.format(func.packed_param_name))
with indent():
self.print_marshal_async_code(func, is_packed=True)
out('} else {')
with indent():
self.print_marshal_async_code(func)
out('}')
else:
self.print_marshal_async_code(func)
if func.marshal_call_after:
out(func.marshal_call_after)

View file

@ -61,6 +61,8 @@ class PrintCode(gl_XML.gl_print_base):
if flavor in ('skip', 'sync'):
continue
print(' DISPATCH_CMD_{0},'.format(func.name))
if func.packed_fixed_params:
print(' DISPATCH_CMD_{0}_packed,'.format(func.name))
print(' NUM_DISPATCH_CMD,')
print('};')
print('')
@ -71,8 +73,9 @@ class PrintCode(gl_XML.gl_print_base):
flavor = func.marshal_flavor()
if flavor in ('custom', 'async'):
print(('uint32_t _mesa_unmarshal_{0}(struct gl_context *ctx, '
'const struct marshal_cmd_{0} *restrict cmd);').format(func.name))
func.print_unmarshal_prototype(suffix=';')
if func.packed_fixed_params:
func.print_unmarshal_prototype(suffix=';', is_packed=True)
if flavor in ('custom', 'async', 'sync') and not func.marshal_is_static():
print('{0} GLAPIENTRY _mesa_marshal_{1}({2});'.format(func.return_type, func.name, func.get_parameter_string()))

View file

@ -68,6 +68,8 @@ class PrintCode(gl_XML.gl_print_base):
if func.marshal_flavor() in ('skip', 'sync'):
continue
out('[DISPATCH_CMD_{0}] = (_mesa_unmarshal_func)_mesa_unmarshal_{0},'.format(func.name))
if func.packed_fixed_params:
out('[DISPATCH_CMD_{0}_packed] = (_mesa_unmarshal_func)_mesa_unmarshal_{0}_packed,'.format(func.name))
out('};')
# Print the string table of function names.
@ -78,6 +80,8 @@ class PrintCode(gl_XML.gl_print_base):
if func.marshal_flavor() in ('skip', 'sync'):
continue
out('[DISPATCH_CMD_{0}] = "{0}",'.format(func.name))
if func.packed_fixed_params:
out('[DISPATCH_CMD_{0}_packed] = "{0}_packed",'.format(func.name))
out('};')

View file

@ -25,6 +25,11 @@
import gl_XML
import sys
import copy
import typeexpr
def pot_align(base, pot_alignment):
    """Round `base` up to the next multiple of `pot_alignment`.

    `pot_alignment` must be a power of two: the result is computed by adding
    `pot_alignment - 1` and masking off the low bits, which is only a valid
    round-up for power-of-two alignments.
    """
    return (base + pot_alignment - 1) & ~(pot_alignment - 1)
class marshal_item_factory(gl_XML.gl_item_factory):
@ -163,6 +168,74 @@ class marshal_function(gl_XML.gl_function):
# from smallest to biggest.
self.fixed_params = sorted(self.fixed_params, key=lambda p: self.get_type_size(p))
# Compute the marshal structure size and the largest hole
self.struct_size = 2 # sizeof(struct marshal_cmd_base)
largest_hole = 0
for p in self.fixed_params:
type_size = self.get_type_size(p)
aligned_size = pot_align(self.struct_size, type_size)
largest_hole = max(aligned_size - self.struct_size, largest_hole)
self.struct_size = aligned_size
self.struct_size = self.struct_size + type_size
# Round down largest_hole to a power of two.
largest_hole = int(2 ** (largest_hole.bit_length() - 1))
# Align the structure to 8 bytes.
aligned_size = pot_align(self.struct_size, 8)
padding_hole = aligned_size - self.struct_size
self.struct_size = aligned_size
# Determine whether to generate a packed version of gl*Pointer calls.
# If there is a hole in the cmd structure, the pointer/offset parameter
# can be truncated and stored in the hole to save 8 bytes per call.
# The version of the structure is determined at runtime based on
# whether the truncation doesn't change the value. This is common with
# VBOs because the pointer/offset is usually small.
#
# If there is no hole, the packed version completely removes
# the pointer/offset parameter and is used when the value is NULL/0
# to remove 8 bytes per call. This is common with VBOs.
self.packed_param_name = None
if (self.is_vertex_pointer_call and
# 32-bit CPUs only benefit if we remove the whole 8-byte slot,
# which means there must be exactly 4-byte padding after the 4-byte
# pointer/offset parameter.
(self.context.pointer_size != 4 or padding_hole == 4)):
for pname in ['pointer', 'offset']:
if pname in [p.name for p in self.fixed_params]:
self.packed_param_name = pname
assert self.packed_param_name
assert not self.variable_params
assert not self.marshal_sync
# Prepare the parameters for the packed version by replacing the type
# of the packed variable or removing it completely.
self.packed_fixed_params = []
if self.packed_param_name:
for p in self.fixed_params:
if p.name == self.packed_param_name:
if largest_hole > 0:
# Select the truncated type.
type = ['GLubyte', 'GLushort', 'GLuint'][largest_hole.bit_length() - 1]
# Clone the parameter and change its type
new_param = copy.deepcopy(p)
new_param.type_expr = typeexpr.type_expression(type, self.context)
self.packed_fixed_params.append(new_param)
else:
self.packed_fixed_params.append(p)
self.packed_param_size = largest_hole
# Sort the parameters by size to move the truncated type into the hole.
self.packed_fixed_params = sorted(self.packed_fixed_params, key=lambda p: self.get_type_size(p))
def get_fixed_params(self, is_packed):
    """Return the fixed parameter list for the requested command layout.

    The packed list (`self.packed_fixed_params`) is returned when `is_packed`
    is truthy; the regular list (`self.fixed_params`) otherwise.
    """
    if is_packed:
        return self.packed_fixed_params
    return self.fixed_params
def marshal_flavor(self):
"""Find out how this function should be marshalled between
client and server threads."""
@ -195,15 +268,15 @@ class marshal_function(gl_XML.gl_function):
self.name[0:8] != 'Internal' and
self.exec_flavor != 'beginend')
def print_struct(self, is_header=False):
def print_struct(self, is_header=False, is_packed=False):
if (self.marshal_struct == 'public') == is_header:
print('struct marshal_cmd_{0}'.format(self.name))
print(self.get_marshal_struct_name(is_packed))
print('{')
print(' struct marshal_cmd_base cmd_base;')
if self.variable_params:
print(' uint16_t num_slots;')
for p in self.fixed_params:
for p in self.get_fixed_params(is_packed):
if p.count:
print(' {0} {1}[{2}];'.format(
p.get_base_type_string(), p.name, p.count))
@ -228,4 +301,16 @@ class marshal_function(gl_XML.gl_function):
p.name, p.counter))
print('};')
elif self.marshal_flavor() in ('custom', 'async'):
print('struct marshal_cmd_{0};'.format(self.name))
print('{0};'.format(self.get_marshal_struct_name(is_packed)))
if not is_packed and self.packed_fixed_params:
self.print_struct(is_header, True)
def get_marshal_struct_name(self, is_packed=False):
    """Return the C struct name of this function's marshal command.

    The name is 'struct marshal_cmd_<name>', with a '_packed' suffix appended
    when `is_packed` is truthy.
    """
    packed_suffix = '_packed' if is_packed else ''
    template = 'struct marshal_cmd_{0}{1}'
    return template.format(self.name, packed_suffix)
def print_unmarshal_prototype(self, is_packed=False, suffix=''):
    """Print the C prototype of this function's unmarshal routine.

    is_packed -- when truthy, the routine name gets a '_packed' suffix and
                 the struct name is requested for the packed layout.
    suffix    -- text appended after the closing parenthesis (e.g. ';' for a
                 forward declaration).
    """
    packed_suffix = '_packed' if is_packed else ''
    struct_name = self.get_marshal_struct_name(is_packed)
    prototype = ('uint32_t _mesa_unmarshal_{0}{1}(struct gl_context *ctx, '
                 'const {2} *restrict cmd){3}')
    print(prototype.format(self.name, packed_suffix, struct_name, suffix))