xpcom/string/src/nsUTF8UtilsSSE2.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/xpcom/string/src/nsUTF8UtilsSSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,101 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +#include "nscore.h"
     1.9 +#include "nsAlgorithm.h"
    1.10 +#include <emmintrin.h>
    1.11 +#include <nsUTF8Utils.h>
    1.12 +
    1.13 +void
    1.14 +LossyConvertEncoding16to8::write_sse2(const char16_t* aSource,
    1.15 +                                      uint32_t aSourceLength)
    1.16 +{
    1.17 +  char* dest = mDestination;
    1.18 +
    1.19 +  // Align source to a 16-byte boundary.
    1.20 +  uint32_t i = 0;
    1.21 +  uint32_t alignLen =
    1.22 +    XPCOM_MIN<uint32_t>(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t));
    1.23 +  for (; i < alignLen; i++) {
    1.24 +    dest[i] = static_cast<unsigned char>(aSource[i]);
    1.25 +  }
    1.26 +
    1.27 +  // Walk 64 bytes (four XMM registers) at a time.
    1.28 +  __m128i vectmask = _mm_set1_epi16(0x00ff);
    1.29 +  for (; aSourceLength - i > 31; i += 32) {
    1.30 +    __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
    1.31 +    source1 = _mm_and_si128(source1, vectmask);
    1.32 +
    1.33 +    __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
    1.34 +    source2 = _mm_and_si128(source2, vectmask);
    1.35 +
    1.36 +    __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
    1.37 +    source3 = _mm_and_si128(source3, vectmask);
    1.38 +
    1.39 +    __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
    1.40 +    source4 = _mm_and_si128(source4, vectmask);
    1.41 +
    1.42 +
    1.43 +    // Pack the source data.  SSE2 views this as a saturating uint16_t to
    1.44 +    // uint8_t conversion, but since we masked off the high-order byte of every
    1.45 +    // uint16_t, we're really just grabbing the low-order bytes of source1 and
    1.46 +    // source2.
    1.47 +    __m128i packed1 = _mm_packus_epi16(source1, source2);
    1.48 +    __m128i packed2 = _mm_packus_epi16(source3, source4);
    1.49 +
    1.50 +    // This store needs to be unaligned since there's no guarantee that the
    1.51 +    // alignment we did above for the source will align the destination.
    1.52 +    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      packed1);
    1.53 +    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
    1.54 +  }
    1.55 +
    1.56 +  // Finish up the rest.
    1.57 +  for (; i < aSourceLength; i++) {
    1.58 +    dest[i] = static_cast<unsigned char>(aSource[i]);
    1.59 +  }
    1.60 +
    1.61 +  mDestination += i;
    1.62 +}
    1.63 +
    1.64 +void
    1.65 +LossyConvertEncoding8to16::write_sse2(const char* aSource,
    1.66 +                                      uint32_t aSourceLength)
    1.67 +{
    1.68 +  char16_t *dest = mDestination;
    1.69 +
    1.70 +  // Align source to a 16-byte boundary.  We choose to align source rather than
    1.71 +  // dest because we'd rather have our loads than our stores be fast. You have
    1.72 +  // to wait for a load to complete, but you can keep on moving after issuing a
    1.73 +  // store.
    1.74 +  uint32_t i = 0;
    1.75 +  uint32_t alignLen = XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf));
    1.76 +  for (; i < alignLen; i++) {
    1.77 +    dest[i] = static_cast<unsigned char>(aSource[i]);
    1.78 +  }
    1.79 +
    1.80 +  // Walk 32 bytes (two XMM registers) at a time.
    1.81 +  for (; aSourceLength - i > 31; i += 32) {
    1.82 +    __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
    1.83 +    __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
    1.84 +
    1.85 +    // Interleave 0s in with the bytes of source to create lo and hi.
    1.86 +    __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
    1.87 +    __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
    1.88 +    __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
    1.89 +    __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
    1.90 +
    1.91 +    // store lo and hi into dest.
    1.92 +    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      lo1);
    1.93 +    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8),  hi1);
    1.94 +    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
    1.95 +    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
    1.96 +  }
    1.97 +
    1.98 +  // Finish up whatever's left.
    1.99 +  for (; i < aSourceLength; i++) {
   1.100 +    dest[i] = static_cast<unsigned char>(aSource[i]);
   1.101 +  }
   1.102 +
   1.103 +  mDestination += i;
   1.104 +}

mercurial