gfx/harfbuzz/src/hb-ot-shape-normalize.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/harfbuzz/src/hb-ot-shape-normalize.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,413 @@
     1.4 +/*
     1.5 + * Copyright © 2011,2012  Google, Inc.
     1.6 + *
     1.7 + *  This is part of HarfBuzz, a text shaping library.
     1.8 + *
     1.9 + * Permission is hereby granted, without written agreement and without
    1.10 + * license or royalty fees, to use, copy, modify, and distribute this
    1.11 + * software and its documentation for any purpose, provided that the
    1.12 + * above copyright notice and the following two paragraphs appear in
    1.13 + * all copies of this software.
    1.14 + *
    1.15 + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
    1.16 + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
    1.17 + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
    1.18 + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
    1.19 + * DAMAGE.
    1.20 + *
    1.21 + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
    1.22 + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
    1.23 + * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
    1.24 + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
    1.25 + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
    1.26 + *
    1.27 + * Google Author(s): Behdad Esfahbod
    1.28 + */
    1.29 +
    1.30 +#include "hb-ot-shape-normalize-private.hh"
    1.31 +#include "hb-ot-shape-complex-private.hh"
    1.32 +#include "hb-ot-shape-private.hh"
    1.33 +
    1.34 +
    1.35 +/*
    1.36 + * HIGHLEVEL DESIGN:
    1.37 + *
    1.38 + * This file exports one main function: _hb_ot_shape_normalize().
    1.39 + *
    1.40 + * This function closely reflects the Unicode Normalization Algorithm,
    1.41 + * yet it's different.
    1.42 + *
    1.43 + * Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
    1.44 + * The logic however tries to use whatever the font can support.
    1.45 + *
    1.46 + * In general what happens is that: each grapheme is decomposed in a chain
    1.47 + * of 1:2 decompositions, marks reordered, and then recomposed if desired,
    1.48 + * so far it's like Unicode Normalization.  However, the decomposition and
    1.49 + * recomposition only happens if the font supports the resulting characters.
    1.50 + *
    1.51 + * The goals are:
    1.52 + *
    1.53 + *   - Try to render all canonically equivalent strings similarly.  To really
    1.54 + *     achieve this we have to always do the full decomposition and then
    1.55 + *     selectively recompose from there.  It's kinda too expensive though, so
    1.56 + *     we skip some cases.  For example, if composed is desired, we simply
    1.57 + *     don't touch 1-character clusters that are supported by the font, even
    1.58 + *     though their NFC may be different.
    1.59 + *
    1.60 + *   - When a font has a precomposed character for a sequence but the 'ccmp'
    1.61 + *     feature in the font is not adequate, use the precomposed character
    1.62 + *     which typically has better mark positioning.
    1.63 + *
    1.64 + *   - When a font does not support a combining mark, but supports it precomposed
    1.65 + *     with previous base, use that.  This needs the itemizer to have this
    1.66 + *     knowledge too.  We need to provide assistance to the itemizer.
    1.67 + *
    1.68 + *   - When a font does not support a character but supports its decomposition,
    1.69 + *     well, use the decomposition (preferring the canonical decomposition, but
    1.70 + *     falling back to the compatibility decomposition if necessary).  The
    1.71 + *     compatibility decomposition is really nice to have, for characters like
    1.72 + *     ellipsis, or various-sized space characters.
    1.73 + *
    1.74 + *   - The complex shapers can customize the compose and decompose functions to
    1.75 + *     offload some of their requirements to the normalizer.  For example, the
    1.76 + *     Indic shaper may want to disallow recomposing of two matras.
    1.77 + *
    1.78 + *   - We try compatibility decomposition if decomposing through canonical
    1.79 + *     decomposition alone failed to find a sequence that the font supports.
    1.80 + *     We don't try compatibility decomposition recursively during the canonical
    1.81 + *     decomposition phase.  This has minimal impact.  There are only a handful
    1.82 + *     of Greek letter that have canonical decompositions that include characters
    1.83 + *     with compatibility decomposition.  Those can be found using this command:
    1.84 + *
    1.85 + *     egrep  "`echo -n ';('; grep ';<' UnicodeData.txt | cut -d';' -f1 | tr '\n' '|'; echo ') '`" UnicodeData.txt
    1.86 + */
    1.87 +
    1.88 +static bool
    1.89 +decompose_unicode (const hb_ot_shape_normalize_context_t *c,
    1.90 +		   hb_codepoint_t  ab,
    1.91 +		   hb_codepoint_t *a,
    1.92 +		   hb_codepoint_t *b)
    1.93 +{
    1.94 +  return c->unicode->decompose (ab, a, b);
    1.95 +}
    1.96 +
    1.97 +static bool
    1.98 +compose_unicode (const hb_ot_shape_normalize_context_t *c,
    1.99 +		 hb_codepoint_t  a,
   1.100 +		 hb_codepoint_t  b,
   1.101 +		 hb_codepoint_t *ab)
   1.102 +{
   1.103 +  return c->unicode->compose (a, b, ab);
   1.104 +}
   1.105 +
   1.106 +static inline void
   1.107 +set_glyph (hb_glyph_info_t &info, hb_font_t *font)
   1.108 +{
   1.109 +  font->get_glyph (info.codepoint, 0, &info.glyph_index());
   1.110 +}
   1.111 +
   1.112 +static inline void
   1.113 +output_char (hb_buffer_t *buffer, hb_codepoint_t unichar, hb_codepoint_t glyph)
   1.114 +{
   1.115 +  buffer->cur().glyph_index() = glyph;
   1.116 +  buffer->output_glyph (unichar);
   1.117 +  _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer->unicode);
   1.118 +}
   1.119 +
   1.120 +static inline void
   1.121 +next_char (hb_buffer_t *buffer, hb_codepoint_t glyph)
   1.122 +{
   1.123 +  buffer->cur().glyph_index() = glyph;
   1.124 +  buffer->next_glyph ();
   1.125 +}
   1.126 +
   1.127 +static inline void
   1.128 +skip_char (hb_buffer_t *buffer)
   1.129 +{
   1.130 +  buffer->skip_glyph ();
   1.131 +}
   1.132 +
   1.133 +/* Returns 0 if didn't decompose, number of resulting characters otherwise. */
   1.134 +static inline unsigned int
   1.135 +decompose (const hb_ot_shape_normalize_context_t *c, bool shortest, hb_codepoint_t ab)
   1.136 +{
   1.137 +  hb_codepoint_t a, b, a_glyph, b_glyph;
   1.138 +  hb_buffer_t * const buffer = c->buffer;
   1.139 +  hb_font_t * const font = c->font;
   1.140 +
   1.141 +  if (!c->decompose (c, ab, &a, &b) ||
   1.142 +      (b && !font->get_glyph (b, 0, &b_glyph)))
   1.143 +    return 0;
   1.144 +
   1.145 +  bool has_a = font->get_glyph (a, 0, &a_glyph);
   1.146 +  if (shortest && has_a) {
   1.147 +    /* Output a and b */
   1.148 +    output_char (buffer, a, a_glyph);
   1.149 +    if (likely (b)) {
   1.150 +      output_char (buffer, b, b_glyph);
   1.151 +      return 2;
   1.152 +    }
   1.153 +    return 1;
   1.154 +  }
   1.155 +
   1.156 +  unsigned int ret;
   1.157 +  if ((ret = decompose (c, shortest, a))) {
   1.158 +    if (b) {
   1.159 +      output_char (buffer, b, b_glyph);
   1.160 +      return ret + 1;
   1.161 +    }
   1.162 +    return ret;
   1.163 +  }
   1.164 +
   1.165 +  if (has_a) {
   1.166 +    output_char (buffer, a, a_glyph);
   1.167 +    if (likely (b)) {
   1.168 +      output_char (buffer, b, b_glyph);
   1.169 +      return 2;
   1.170 +    }
   1.171 +    return 1;
   1.172 +  }
   1.173 +
   1.174 +  return 0;
   1.175 +}
   1.176 +
   1.177 +/* Returns 0 if didn't decompose, number of resulting characters otherwise. */
   1.178 +static inline unsigned int
   1.179 +decompose_compatibility (const hb_ot_shape_normalize_context_t *c, hb_codepoint_t u)
   1.180 +{
   1.181 +  unsigned int len, i;
   1.182 +  hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN];
   1.183 +  hb_codepoint_t glyphs[HB_UNICODE_MAX_DECOMPOSITION_LEN];
   1.184 +
   1.185 +  len = c->buffer->unicode->decompose_compatibility (u, decomposed);
   1.186 +  if (!len)
   1.187 +    return 0;
   1.188 +
   1.189 +  for (i = 0; i < len; i++)
   1.190 +    if (!c->font->get_glyph (decomposed[i], 0, &glyphs[i]))
   1.191 +      return 0;
   1.192 +
   1.193 +  for (i = 0; i < len; i++)
   1.194 +    output_char (c->buffer, decomposed[i], glyphs[i]);
   1.195 +
   1.196 +  return len;
   1.197 +}
   1.198 +
   1.199 +static inline void
   1.200 +decompose_current_character (const hb_ot_shape_normalize_context_t *c, bool shortest)
   1.201 +{
   1.202 +  hb_buffer_t * const buffer = c->buffer;
   1.203 +  hb_codepoint_t glyph;
   1.204 +
   1.205 +  /* Kind of a cute waterfall here... */
   1.206 +  if (shortest && c->font->get_glyph (buffer->cur().codepoint, 0, &glyph))
   1.207 +    next_char (buffer, glyph);
   1.208 +  else if (decompose (c, shortest, buffer->cur().codepoint))
   1.209 +    skip_char (buffer);
   1.210 +  else if (!shortest && c->font->get_glyph (buffer->cur().codepoint, 0, &glyph))
   1.211 +    next_char (buffer, glyph);
   1.212 +  else if (decompose_compatibility (c, buffer->cur().codepoint))
   1.213 +    skip_char (buffer);
   1.214 +  else
   1.215 +    next_char (buffer, glyph); /* glyph is initialized in earlier branches. */
   1.216 +}
   1.217 +
   1.218 +static inline void
   1.219 +handle_variation_selector_cluster (const hb_ot_shape_normalize_context_t *c, unsigned int end, bool short_circuit)
   1.220 +{
   1.221 +  /* TODO Currently if there's a variation-selector we give-up, it's just too hard. */
   1.222 +  hb_buffer_t * const buffer = c->buffer;
   1.223 +  hb_font_t * const font = c->font;
   1.224 +  for (; buffer->idx < end - 1;) {
   1.225 +    if (unlikely (buffer->unicode->is_variation_selector (buffer->cur(+1).codepoint))) {
   1.226 +      /* The next two lines are some ugly lines... But work. */
   1.227 +      if (font->get_glyph (buffer->cur().codepoint, buffer->cur(+1).codepoint, &buffer->cur().glyph_index()))
   1.228 +      {
   1.229 +	buffer->replace_glyphs (2, 1, &buffer->cur().codepoint);
   1.230 +      }
   1.231 +      else
   1.232 +      {
   1.233 +        /* Just pass on the two characters separately, let GSUB do its magic. */
   1.234 +	set_glyph (buffer->cur(), font);
   1.235 +	buffer->next_glyph ();
   1.236 +	set_glyph (buffer->cur(), font);
   1.237 +	buffer->next_glyph ();
   1.238 +      }
   1.239 +      /* Skip any further variation selectors. */
   1.240 +      while (buffer->idx < end && unlikely (buffer->unicode->is_variation_selector (buffer->cur().codepoint)))
   1.241 +      {
   1.242 +	set_glyph (buffer->cur(), font);
   1.243 +	buffer->next_glyph ();
   1.244 +      }
   1.245 +    } else {
   1.246 +      set_glyph (buffer->cur(), font);
   1.247 +      buffer->next_glyph ();
   1.248 +    }
   1.249 +  }
   1.250 +  if (likely (buffer->idx < end)) {
   1.251 +    set_glyph (buffer->cur(), font);
   1.252 +    buffer->next_glyph ();
   1.253 +  }
   1.254 +}
   1.255 +
   1.256 +static inline void
   1.257 +decompose_multi_char_cluster (const hb_ot_shape_normalize_context_t *c, unsigned int end, bool short_circuit)
   1.258 +{
   1.259 +  hb_buffer_t * const buffer = c->buffer;
   1.260 +  for (unsigned int i = buffer->idx; i < end; i++)
   1.261 +    if (unlikely (buffer->unicode->is_variation_selector (buffer->info[i].codepoint))) {
   1.262 +      handle_variation_selector_cluster (c, end, short_circuit);
   1.263 +      return;
   1.264 +    }
   1.265 +
   1.266 +  while (buffer->idx < end)
   1.267 +    decompose_current_character (c, short_circuit);
   1.268 +}
   1.269 +
   1.270 +static inline void
   1.271 +decompose_cluster (const hb_ot_shape_normalize_context_t *c, unsigned int end, bool might_short_circuit, bool always_short_circuit)
   1.272 +{
   1.273 +  if (likely (c->buffer->idx + 1 == end))
   1.274 +    decompose_current_character (c, might_short_circuit);
   1.275 +  else
   1.276 +    decompose_multi_char_cluster (c, end, always_short_circuit);
   1.277 +}
   1.278 +
   1.279 +
   1.280 +static int
   1.281 +compare_combining_class (const hb_glyph_info_t *pa, const hb_glyph_info_t *pb)
   1.282 +{
   1.283 +  unsigned int a = _hb_glyph_info_get_modified_combining_class (pa);
   1.284 +  unsigned int b = _hb_glyph_info_get_modified_combining_class (pb);
   1.285 +
   1.286 +  return a < b ? -1 : a == b ? 0 : +1;
   1.287 +}
   1.288 +
   1.289 +
   1.290 +void
   1.291 +_hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
   1.292 +			hb_buffer_t *buffer,
   1.293 +			hb_font_t *font)
   1.294 +{
   1.295 +  hb_ot_shape_normalization_mode_t mode = plan->shaper->normalization_preference;
   1.296 +  const hb_ot_shape_normalize_context_t c = {
   1.297 +    plan,
   1.298 +    buffer,
   1.299 +    font,
   1.300 +    buffer->unicode,
   1.301 +    plan->shaper->decompose ? plan->shaper->decompose : decompose_unicode,
   1.302 +    plan->shaper->compose   ? plan->shaper->compose   : compose_unicode
   1.303 +  };
   1.304 +
   1.305 +  bool always_short_circuit = mode == HB_OT_SHAPE_NORMALIZATION_MODE_NONE;
   1.306 +  bool might_short_circuit = always_short_circuit ||
   1.307 +			     (mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED &&
   1.308 +			      mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT);
   1.309 +  unsigned int count;
   1.310 +
   1.311 +  /* We do a fairly straightforward yet custom normalization process in three
   1.312 +   * separate rounds: decompose, reorder, recompose (if desired).  Currently
   1.313 +   * this makes two buffer swaps.  We can make it faster by moving the last
   1.314 +   * two rounds into the inner loop for the first round, but it's more readable
   1.315 +   * this way. */
   1.316 +
   1.317 +
   1.318 +  /* First round, decompose */
   1.319 +
   1.320 +  buffer->clear_output ();
   1.321 +  count = buffer->len;
   1.322 +  for (buffer->idx = 0; buffer->idx < count;)
   1.323 +  {
   1.324 +    unsigned int end;
   1.325 +    for (end = buffer->idx + 1; end < count; end++)
   1.326 +      if (buffer->cur().cluster != buffer->info[end].cluster)
   1.327 +        break;
   1.328 +
   1.329 +    decompose_cluster (&c, end, might_short_circuit, always_short_circuit);
   1.330 +  }
   1.331 +  buffer->swap_buffers ();
   1.332 +
   1.333 +
   1.334 +  /* Second round, reorder (inplace) */
   1.335 +
   1.336 +  count = buffer->len;
   1.337 +  for (unsigned int i = 0; i < count; i++)
   1.338 +  {
   1.339 +    if (_hb_glyph_info_get_modified_combining_class (&buffer->info[i]) == 0)
   1.340 +      continue;
   1.341 +
   1.342 +    unsigned int end;
   1.343 +    for (end = i + 1; end < count; end++)
   1.344 +      if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0)
   1.345 +        break;
   1.346 +
   1.347 +    /* We are going to do a bubble-sort.  Only do this if the
   1.348 +     * sequence is short.  Doing it on long sequences can result
   1.349 +     * in an O(n^2) DoS. */
   1.350 +    if (end - i > 10) {
   1.351 +      i = end;
   1.352 +      continue;
   1.353 +    }
   1.354 +
   1.355 +    hb_bubble_sort (buffer->info + i, end - i, compare_combining_class);
   1.356 +
   1.357 +    i = end;
   1.358 +  }
   1.359 +
   1.360 +
   1.361 +  if (mode == HB_OT_SHAPE_NORMALIZATION_MODE_NONE ||
   1.362 +      mode == HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED)
   1.363 +    return;
   1.364 +
   1.365 +  /* Third round, recompose */
   1.366 +
   1.367 +  /* As noted in the comment earlier, we don't try to combine
   1.368 +   * ccc=0 chars with their previous Starter. */
   1.369 +
   1.370 +  buffer->clear_output ();
   1.371 +  count = buffer->len;
   1.372 +  unsigned int starter = 0;
   1.373 +  buffer->next_glyph ();
   1.374 +  while (buffer->idx < count)
   1.375 +  {
   1.376 +    hb_codepoint_t composed, glyph;
   1.377 +    if (/* We don't try to compose a non-mark character with it's preceding starter.
   1.378 +	 * This is both an optimization to avoid trying to compose every two neighboring
   1.379 +	 * glyphs in most scripts AND a desired feature for Hangul.  Apparently Hangul
   1.380 +	 * fonts are not designed to mix-and-match pre-composed syllables and Jamo. */
   1.381 +	HB_UNICODE_GENERAL_CATEGORY_IS_MARK (_hb_glyph_info_get_general_category (&buffer->cur())) &&
   1.382 +	/* If there's anything between the starter and this char, they should have CCC
   1.383 +	 * smaller than this character's. */
   1.384 +	(starter == buffer->out_len - 1 ||
   1.385 +	 _hb_glyph_info_get_modified_combining_class (&buffer->prev()) < _hb_glyph_info_get_modified_combining_class (&buffer->cur())) &&
   1.386 +	/* And compose. */
   1.387 +	c.compose (&c,
   1.388 +		   buffer->out_info[starter].codepoint,
   1.389 +		   buffer->cur().codepoint,
   1.390 +		   &composed) &&
   1.391 +	/* And the font has glyph for the composite. */
   1.392 +	font->get_glyph (composed, 0, &glyph))
   1.393 +    {
   1.394 +      /* Composes. */
   1.395 +      buffer->next_glyph (); /* Copy to out-buffer. */
   1.396 +      if (unlikely (buffer->in_error))
   1.397 +        return;
   1.398 +      buffer->merge_out_clusters (starter, buffer->out_len);
   1.399 +      buffer->out_len--; /* Remove the second composable. */
   1.400 +      /* Modify starter and carry on. */
   1.401 +      buffer->out_info[starter].codepoint = composed;
   1.402 +      buffer->out_info[starter].glyph_index() = glyph;
   1.403 +      _hb_glyph_info_set_unicode_props (&buffer->out_info[starter], buffer->unicode);
   1.404 +
   1.405 +      continue;
   1.406 +    }
   1.407 +
   1.408 +    /* Blocked, or doesn't compose. */
   1.409 +    buffer->next_glyph ();
   1.410 +
   1.411 +    if (_hb_glyph_info_get_modified_combining_class (&buffer->prev()) == 0)
   1.412 +      starter = buffer->out_len - 1;
   1.413 +  }
   1.414 +  buffer->swap_buffers ();
   1.415 +
   1.416 +}

mercurial