[autofit] Speed up computation of af_reverse_character_map_new. (4/4)

With this commit, the start-up time for creating the reverse character map gets reduced from more than 300% to about 25% (as tested with `arial.ttf` version 7.00).

* src/autofit/afadjust.c [FT_CONFIG_OPTION_USE_HARFBUZZ]: Include `afgsub.h`.
(add_substitute): New auxiliary function to recursively add elements to the reverse cmap.
(af_reverse_character_map_new): New code that replaces the old code removed in the previous commit.
2025-12-28 05:40:06 +01:00 · 2025-06-14 16:21:44 +02:00 · 2025-06-14 16:21:44 +02:00 · 6a982bde85
commit 6a982bde85
parent 9e8b642b10
1 changed files with 247 additions and 22 deletions
--- a/src/autofit/afadjust.c
+++ b/src/autofit/afadjust.c
@ -18,6 +18,9 @@
 */

 #include "afadjust.h"
+#ifdef FT_CONFIG_OPTION_USE_HARFBUZZ
+#  include "afgsub.h"
+#endif

 #include <freetype/freetype.h>
 #include <freetype/internal/ftobjs.h>
@ -1169,6 +1172,108 @@
  }


+#ifdef FT_CONFIG_OPTION_USE_HARFBUZZ
+
+  /* Recursively add to `reverse_map` the substitutes recorded in         */
+  /* `subst_map` for glyph `glyph_idx`, associating each newly added      */
+  /* glyph index with `codepoint`.                                        */
+  /*                                                                      */
+  /* `value` is the `subst_map` entry for `glyph_idx`: its lower 16 bits  */
+  /* hold the first substitute, and its upper bits hold the number of     */
+  /* remaining substitutes.  Remaining substitute `i` (starting with 1)   */
+  /* is looked up in `subst_map` under the key `glyph_idx + (i << 16)`.   */
+  /*                                                                      */
+  /* A substitute is followed recursively only if inserting it increased  */
+  /* the number of entries in `reverse_map`; glyph indices that already   */
+  /* have an entry are thus neither overwritten nor visited again.        */
+  /*                                                                      */
+  /* Returns `FT_Err_Ok` on success, or the error code of the first       */
+  /* failing hash insertion.                                              */
+  static FT_Error
+  add_substitute( FT_UInt    glyph_idx,
+                  size_t     value,
+                  FT_UInt32  codepoint,
+                  FT_Hash    reverse_map,
+                  FT_Hash    subst_map,
+                  FT_Memory  memory )
+  {
+    FT_Error  error;
+
+    FT_UInt  first_substitute = value & 0xFFFF;
+
+    /* Entry count before an insertion; used to detect new entries. */
+    FT_UInt  used = reverse_map->used;
+
+
+    /*
+      OpenType features like 'unic' map lowercase letter glyphs to uppercase
+      forms (and vice versa), which could lead to the use of wrong entries
+      in the adjustment database.  For this reason we don't overwrite,
+      prioritizing cmap entries.
+
+      XXX Note, however, that this cannot cover all cases since there might
+      be contradictory entries for glyphs not in the cmap.  A possible
+      solution might be to specially mark pairs of related lowercase and
+      uppercase characters in the adjustment database that have diacritics
+      on different vertical sides (for example, U+0122 'Ģ' and U+0123 'ģ').
+      The auto-hinter could then perform a topological analysis to do the
+      right thing.
+    */
+    error = ft_hash_num_insert_no_overwrite( first_substitute, codepoint,
+                                             reverse_map, memory );
+    if ( error )
+      return error;
+
+    /* Follow the substitution chain only for a newly added glyph. */
+    if ( reverse_map->used > used )
+    {
+      size_t*  subst = ft_hash_num_lookup( first_substitute, subst_map );
+
+
+      if ( subst )
+      {
+        error = add_substitute( first_substitute, *subst, codepoint,
+                                reverse_map, subst_map, memory );
+        if ( error )
+          return error;
+      }
+    }
+
+    /* The remaining substitutes. */
+    if ( value & 0xFFFF0000U )
+    {
+      FT_UInt  num_substitutes = value >> 16;
+
+      FT_UInt  i;
+
+
+      for ( i = 1; i <= num_substitutes; i++ )
+      {
+        size_t*  substitute = ft_hash_num_lookup( glyph_idx + ( i << 16 ),
+                                                  subst_map );
+
+
+        /* Be defensive: skip a slot that is missing from `subst_map` */
+        /* instead of dereferencing a null pointer.                   */
+        if ( !substitute )
+          continue;
+
+        used = reverse_map->used;
+
+        error = ft_hash_num_insert_no_overwrite( *substitute,
+                                                 codepoint,
+                                                 reverse_map,
+                                                 memory );
+        if ( error )
+          return error;
+
+        /* As above, recurse only if the glyph was newly added. */
+        if ( reverse_map->used > used )
+        {
+          size_t*  subst = ft_hash_num_lookup( *substitute, subst_map );
+
+
+          if ( subst )
+          {
+            error = add_substitute( *substitute, *subst, codepoint,
+                                    reverse_map, subst_map, memory );
+            if ( error )
+              return error;
+          }
+        }
+      }
+    }
+
+    return FT_Err_Ok;
+  }
+
+#endif /* FT_CONFIG_OPTION_USE_HARFBUZZ */
+
+
+  /* Construct a 'reverse cmap' (i.e., a mapping from glyph indices to   */
+  /* character codes) for all glyphs that an input code point could turn */
+  /* into.                                                               */
+  /*                                                                     */
+  /* If HarfBuzz support is not available, this is the direct inversion  */
+  /* of the cmap table, otherwise the mapping gets extended with data    */
+  /* from the 'GSUB' table.                                              */
  FT_LOCAL_DEF( FT_Error )
  af_reverse_character_map_new( FT_Hash         *map,
                                AF_StyleMetrics  metrics )
@ -1184,19 +1289,6 @@
    FT_UInt32  codepoint;
    FT_Offset  i;

-#ifdef FT_CONFIG_OPTION_USE_HARFBUZZ
-    /* The next four variables are initialized to avoid compiler warnings. */
-    hb_font_t  *hb_font = NULL;
-    hb_face_t  *hb_face = NULL;
-
-    hb_set_t  *gsub_lookups = NULL;
-
-    hb_script_t  script;
-
-    unsigned int  script_count   = 1;
-    hb_tag_t      script_tags[2] = { HB_TAG_NONE, HB_TAG_NONE };
-#endif
-

    FT_TRACE4(( "af_reverse_character_map_new:"
                " building reverse character map (style `%s')\n",
@ -1219,9 +1311,69 @@
    if ( error )
      goto Exit;

+    /* Initialize reverse cmap with data directly from the cmap table. */
+    for ( i = 0; i < AF_ADJUSTMENT_DATABASE_LENGTH; i++ )
+    {
+      FT_Int  cmap_glyph;
+
+
+      /*
+        We cannot restrict `codepoint` to character ranges; we have no
+        control what data the script-specific portion of the GSUB table
+        actually holds.
+
+        An example is `arial.ttf` version 7.00; in this font, there are
+        lookups for Cyrillic (lookup 43), Greek (lookup 44), and Latin
+        (lookup 45) that map capital letter glyphs to small capital glyphs.
+        It is tempting to expect that script-specific versions of the 'c2sc'
+        feature only use script-specific lookups.  However, this is not the
+        case in this font: the feature uses all three lookups regardless of
+        the script.
+
+        The auto-hinter, while assigning glyphs to styles, uses the first
+        coverage result it encounters for a particular glyph.  For example,
+        if the coverage for Cyrillic is tested before Latin (as is currently
+        the case), glyphs without a cmap entry that are covered in 'c2sc'
+        are treated as Cyrillic.
+
+        If we now look at glyph 3498, which is a small-caps version of the
+        Latin character 'A grave' (U+00C0, glyph 172), we can see that it is
+        registered as belonging to a Cyrillic style due to the algorithm
+        just described.  As a result, checking only for characters from the
+        Latin range would miss this glyph; we thus have to test all
+        character codes in the database.
+      */
+      codepoint = adjustment_database[i].codepoint;
+
+      cmap_glyph = FT_Get_Char_Index( face, codepoint );
+      if ( cmap_glyph == 0 )
+        continue;
+
+      error = ft_hash_num_insert( cmap_glyph, codepoint, *map, memory );
+      if ( error )
+        goto Exit;
+    }
+
 #ifdef FT_CONFIG_OPTION_USE_HARFBUZZ

    {
+      hb_font_t  *hb_font;
+      hb_face_t  *hb_face;
+
+      hb_set_t    *gsub_lookups;
+      hb_script_t  script;
+
+      unsigned int  script_count   = 1;
+      hb_tag_t      script_tags[2] = { HB_TAG_NONE, HB_TAG_NONE };
+
+      FT_Hash  subst_map = NULL;
+
+      hb_codepoint_t  idx;
+      FT_UInt         hash_idx;
+      FT_Int          glyph_idx;
+      size_t          value;
+
+
      /* No need to check whether HarfBuzz has allocation issues; */
      /* it continues to work in such cases and simply returns    */
      /* 'empty' objects that do nothing.                         */
@ -1245,8 +1397,7 @@

 #ifdef FT_DEBUG_LEVEL_TRACE
      {
-        hb_codepoint_t  idx;
-        FT_Bool         have_idx = FALSE;
+        FT_Bool  have_idx = FALSE;


        FT_TRACE4(( "  GSUB lookups to check:\n" ));
@ -1267,17 +1418,91 @@
      }
 #endif

+      if ( FT_QNEW( subst_map ) )
+        goto Exit_HarfBuzz;
+
+      error = ft_hash_num_init( subst_map, memory );
+      if ( error )
+        goto Exit_HarfBuzz;
+
+      idx = HB_SET_VALUE_INVALID;
+      while ( hb( set_next )( gsub_lookups, &idx ) )
+      {
+        FT_UInt32  offset = globals->gsub_lookups_single_alternate[idx];
+
+
+        /* Put all substitutions into a single hash table.  Note that   */
+        /* the hash values usually contain more than a single character */
+        /* code; this can happen if different 'SingleSubst' subtables   */
+        /* map a given glyph index to different substitutions, or if    */
+        /* 'AlternateSubst' subtable entries are present.               */
+        if ( offset )
+          af_map_lookup( globals, subst_map, offset );
+      }
+
+      /*
+        Now iterate over the collected substitution data in `subst_map`
+        (using recursion to resolve one-to-many mappings) and insert the
+        data into the reverse cmap.
+
+        As an example, suppose we have the following cmap and substitution
+        data:
+
+          cmap: X -> a
+                Y -> b
+                Z -> c
+
+          substitutions: a -> b
+                         b -> c, d
+                         d -> e
+
+        The reverse map now becomes as follows.
+
+          a -> X
+          b -> Y
+          c -> Z (via cmap, ignoring mapping from 'b')
+          d -> Y (via 'b')
+          e -> Y (via 'b' and 'd')
+      */
+
+      hash_idx = 0;
+      while ( ft_hash_num_iterator( &hash_idx,
+                                    &glyph_idx,
+                                    &value,
+                                    subst_map ) )
+      {
+        size_t*  val;
+
+
+        /* Ignore keys that do not point to the first substitute. */
+        if ( glyph_idx & 0xFFFF0000 )
+          continue;
+
+        /* Ignore glyph indices that are not related to accents. */
+        val = ft_hash_num_lookup( glyph_idx, *map );
+        if ( !val )
+          continue;
+
+        codepoint = *val;
+
+        error = add_substitute( glyph_idx, value, codepoint,
+                                *map, subst_map, memory );
+        if ( error )
+          break;
+      }
+
+    Exit_HarfBuzz:
+      hb( set_destroy )( gsub_lookups );
+
+      ft_hash_num_free( subst_map, memory );
+      FT_FREE( subst_map );
+
+      if ( error )
+        goto Exit;
    }

 #endif /* FT_CONFIG_OPTION_USE_HARFBUZZ */

-#ifdef FT_CONFIG_OPTION_USE_HARFBUZZ
-    if ( hb( version_atleast )( 7, 2, 0 ) )
-    {
-      hb( set_destroy )( gsub_lookups );
-    }
-#endif
-
    FT_TRACE4(( "    reverse character map built successfully"
                " with %d entries\n", ( *map )->used ));