intl/hyphenation/src/nsHyphenator.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/hyphenation/src/nsHyphenator.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,122 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     #include "nsHyphenator.h"
     #include "nsIFile.h"
     #include "nsUTF8Utils.h"
     #include "nsUnicodeProperties.h"
     #include "nsUnicharUtilCIID.h"
     #include "nsIURI.h"

     #include "hyphen.h"

     #include <stdlib.h>
    1.17 +
    1.18 +nsHyphenator::nsHyphenator(nsIURI *aURI)
    1.19 +  : mDict(nullptr)
    1.20 +{
    1.21 +  nsCString uriSpec;
    1.22 +  nsresult rv = aURI->GetSpec(uriSpec);
    1.23 +  if (NS_FAILED(rv)) {
    1.24 +    return;
    1.25 +  }
    1.26 +  mDict = hnj_hyphen_load(uriSpec.get());
    1.27 +#ifdef DEBUG
    1.28 +  if (mDict) {
    1.29 +    printf("loaded hyphenation patterns from %s\n", uriSpec.get());
    1.30 +  }
    1.31 +#endif
    1.32 +}
    1.33 +
    1.34 +nsHyphenator::~nsHyphenator()
    1.35 +{
    1.36 +  if (mDict != nullptr) {
    1.37 +    hnj_hyphen_free((HyphenDict*)mDict);
    1.38 +    mDict = nullptr;
    1.39 +  }
    1.40 +}
    1.41 +
    1.42 +bool
    1.43 +nsHyphenator::IsValid()
    1.44 +{
    1.45 +  return (mDict != nullptr);
    1.46 +}
    1.47 +
    1.48 +nsresult
    1.49 +nsHyphenator::Hyphenate(const nsAString& aString,
    1.50 +                        FallibleTArray<bool>& aHyphens)
    1.51 +{
    1.52 +  if (!aHyphens.SetLength(aString.Length())) {
    1.53 +    return NS_ERROR_OUT_OF_MEMORY;
    1.54 +  }
    1.55 +  memset(aHyphens.Elements(), false, aHyphens.Length());
    1.56 +
    1.57 +  bool inWord = false;
    1.58 +  uint32_t wordStart = 0, wordLimit = 0;
    1.59 +  uint32_t chLen;
    1.60 +  for (uint32_t i = 0; i < aString.Length(); i += chLen) {
    1.61 +    uint32_t ch = aString[i];
    1.62 +    chLen = 1;
    1.63 +
    1.64 +    if (NS_IS_HIGH_SURROGATE(ch)) {
    1.65 +      if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) {
    1.66 +        ch = SURROGATE_TO_UCS4(ch, aString[i+1]);
    1.67 +        chLen = 2;
    1.68 +      } else {
    1.69 +        NS_WARNING("unpaired surrogate found during hyphenation");
    1.70 +      }
    1.71 +    }
    1.72 +
    1.73 +    nsIUGenCategory::nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch);
    1.74 +    if (cat == nsIUGenCategory::kLetter || cat == nsIUGenCategory::kMark) {
    1.75 +      if (!inWord) {
    1.76 +        inWord = true;
    1.77 +        wordStart = i;
    1.78 +      }
    1.79 +      wordLimit = i + chLen;
    1.80 +      if (i + chLen < aString.Length()) {
    1.81 +        continue;
    1.82 +      }
    1.83 +    }
    1.84 +
    1.85 +    if (inWord) {
    1.86 +      const char16_t *begin = aString.BeginReading();
    1.87 +      NS_ConvertUTF16toUTF8 utf8(begin + wordStart,
    1.88 +                                 wordLimit - wordStart);
    1.89 +      nsAutoTArray<char,200> utf8hyphens;
    1.90 +      utf8hyphens.SetLength(utf8.Length() + 5);
    1.91 +      char **rep = nullptr;
    1.92 +      int *pos = nullptr;
    1.93 +      int *cut = nullptr;
    1.94 +      int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict,
    1.95 +                                      utf8.BeginReading(), utf8.Length(),
    1.96 +                                      utf8hyphens.Elements(), nullptr,
    1.97 +                                      &rep, &pos, &cut);
    1.98 +      if (!err) {
    1.99 +        // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
   1.100 +        // from utf8 code unit indexing (which would match the utf8 input
   1.101 +        // string directly) to Unicode character indexing.
   1.102 +        // We then need to convert this to utf16 code unit offsets for Gecko.
   1.103 +        const char *hyphPtr = utf8hyphens.Elements();
   1.104 +        const char16_t *cur = begin + wordStart;
   1.105 +        const char16_t *end = begin + wordLimit;
   1.106 +        while (cur < end) {
   1.107 +          if (*hyphPtr & 0x01) {
   1.108 +            aHyphens[cur - begin] = true;
   1.109 +          }
   1.110 +          cur++;
   1.111 +          if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
   1.112 +              NS_IS_HIGH_SURROGATE(*(cur-1)))
   1.113 +          {
   1.114 +            cur++;
   1.115 +          }
   1.116 +          hyphPtr++;
   1.117 +        }
   1.118 +      }
   1.119 +    }
   1.120 +    
   1.121 +    inWord = false;
   1.122 +  }
   1.123 +
   1.124 +  return NS_OK;
   1.125 +}

mercurial