xpcom/string/src/nsUTF8UtilsSSE2.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 4
michael@0 5 #include "nscore.h"
michael@0 6 #include "nsAlgorithm.h"
michael@0 7 #include <emmintrin.h>
michael@0 8 #include <nsUTF8Utils.h>
michael@0 9
michael@0 10 void
michael@0 11 LossyConvertEncoding16to8::write_sse2(const char16_t* aSource,
michael@0 12 uint32_t aSourceLength)
michael@0 13 {
michael@0 14 char* dest = mDestination;
michael@0 15
michael@0 16 // Align source to a 16-byte boundary.
michael@0 17 uint32_t i = 0;
michael@0 18 uint32_t alignLen =
michael@0 19 XPCOM_MIN<uint32_t>(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t));
michael@0 20 for (; i < alignLen; i++) {
michael@0 21 dest[i] = static_cast<unsigned char>(aSource[i]);
michael@0 22 }
michael@0 23
michael@0 24 // Walk 64 bytes (four XMM registers) at a time.
michael@0 25 __m128i vectmask = _mm_set1_epi16(0x00ff);
michael@0 26 for (; aSourceLength - i > 31; i += 32) {
michael@0 27 __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
michael@0 28 source1 = _mm_and_si128(source1, vectmask);
michael@0 29
michael@0 30 __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
michael@0 31 source2 = _mm_and_si128(source2, vectmask);
michael@0 32
michael@0 33 __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
michael@0 34 source3 = _mm_and_si128(source3, vectmask);
michael@0 35
michael@0 36 __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
michael@0 37 source4 = _mm_and_si128(source4, vectmask);
michael@0 38
michael@0 39
michael@0 40 // Pack the source data. SSE2 views this as a saturating uint16_t to
michael@0 41 // uint8_t conversion, but since we masked off the high-order byte of every
michael@0 42 // uint16_t, we're really just grabbing the low-order bytes of source1 and
michael@0 43 // source2.
michael@0 44 __m128i packed1 = _mm_packus_epi16(source1, source2);
michael@0 45 __m128i packed2 = _mm_packus_epi16(source3, source4);
michael@0 46
michael@0 47 // This store needs to be unaligned since there's no guarantee that the
michael@0 48 // alignment we did above for the source will align the destination.
michael@0 49 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1);
michael@0 50 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
michael@0 51 }
michael@0 52
michael@0 53 // Finish up the rest.
michael@0 54 for (; i < aSourceLength; i++) {
michael@0 55 dest[i] = static_cast<unsigned char>(aSource[i]);
michael@0 56 }
michael@0 57
michael@0 58 mDestination += i;
michael@0 59 }
michael@0 60
michael@0 61 void
michael@0 62 LossyConvertEncoding8to16::write_sse2(const char* aSource,
michael@0 63 uint32_t aSourceLength)
michael@0 64 {
michael@0 65 char16_t *dest = mDestination;
michael@0 66
michael@0 67 // Align source to a 16-byte boundary. We choose to align source rather than
michael@0 68 // dest because we'd rather have our loads than our stores be fast. You have
michael@0 69 // to wait for a load to complete, but you can keep on moving after issuing a
michael@0 70 // store.
michael@0 71 uint32_t i = 0;
michael@0 72 uint32_t alignLen = XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf));
michael@0 73 for (; i < alignLen; i++) {
michael@0 74 dest[i] = static_cast<unsigned char>(aSource[i]);
michael@0 75 }
michael@0 76
michael@0 77 // Walk 32 bytes (two XMM registers) at a time.
michael@0 78 for (; aSourceLength - i > 31; i += 32) {
michael@0 79 __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
michael@0 80 __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
michael@0 81
michael@0 82 // Interleave 0s in with the bytes of source to create lo and hi.
michael@0 83 __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
michael@0 84 __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
michael@0 85 __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
michael@0 86 __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
michael@0 87
michael@0 88 // store lo and hi into dest.
michael@0 89 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1);
michael@0 90 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1);
michael@0 91 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
michael@0 92 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
michael@0 93 }
michael@0 94
michael@0 95 // Finish up whatever's left.
michael@0 96 for (; i < aSourceLength; i++) {
michael@0 97 dest[i] = static_cast<unsigned char>(aSource[i]);
michael@0 98 }
michael@0 99
michael@0 100 mDestination += i;
michael@0 101 }

mercurial