1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/xpcom/string/src/nsUTF8UtilsSSE2.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,101 @@ 1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.7 + 1.8 +#include "nscore.h" 1.9 +#include "nsAlgorithm.h" 1.10 +#include <emmintrin.h> 1.11 +#include <nsUTF8Utils.h> 1.12 + 1.13 +void 1.14 +LossyConvertEncoding16to8::write_sse2(const char16_t* aSource, 1.15 + uint32_t aSourceLength) 1.16 +{ 1.17 + char* dest = mDestination; 1.18 + 1.19 + // Align source to a 16-byte boundary. 1.20 + uint32_t i = 0; 1.21 + uint32_t alignLen = 1.22 + XPCOM_MIN<uint32_t>(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t)); 1.23 + for (; i < alignLen; i++) { 1.24 + dest[i] = static_cast<unsigned char>(aSource[i]); 1.25 + } 1.26 + 1.27 + // Walk 64 bytes (four XMM registers) at a time. 1.28 + __m128i vectmask = _mm_set1_epi16(0x00ff); 1.29 + for (; aSourceLength - i > 31; i += 32) { 1.30 + __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i)); 1.31 + source1 = _mm_and_si128(source1, vectmask); 1.32 + 1.33 + __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8)); 1.34 + source2 = _mm_and_si128(source2, vectmask); 1.35 + 1.36 + __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16)); 1.37 + source3 = _mm_and_si128(source3, vectmask); 1.38 + 1.39 + __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24)); 1.40 + source4 = _mm_and_si128(source4, vectmask); 1.41 + 1.42 + 1.43 + // Pack the source data. SSE2 views this as a saturating uint16_t to 1.44 + // uint8_t conversion, but since we masked off the high-order byte of every 1.45 + // uint16_t, we're really just grabbing the low-order bytes of source1 and 1.46 + // source2. 1.47 + __m128i packed1 = _mm_packus_epi16(source1, source2); 1.48 + __m128i packed2 = _mm_packus_epi16(source3, source4); 1.49 + 1.50 + // This store needs to be unaligned since there's no guarantee that the 1.51 + // alignment we did above for the source will align the destination. 1.52 + _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1); 1.53 + _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2); 1.54 + } 1.55 + 1.56 + // Finish up the rest. 1.57 + for (; i < aSourceLength; i++) { 1.58 + dest[i] = static_cast<unsigned char>(aSource[i]); 1.59 + } 1.60 + 1.61 + mDestination += i; 1.62 +} 1.63 + 1.64 +void 1.65 +LossyConvertEncoding8to16::write_sse2(const char* aSource, 1.66 + uint32_t aSourceLength) 1.67 +{ 1.68 + char16_t *dest = mDestination; 1.69 + 1.70 + // Align source to a 16-byte boundary. We choose to align source rather than 1.71 + // dest because we'd rather have our loads than our stores be fast. You have 1.72 + // to wait for a load to complete, but you can keep on moving after issuing a 1.73 + // store. 1.74 + uint32_t i = 0; 1.75 + uint32_t alignLen = XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf)); 1.76 + for (; i < alignLen; i++) { 1.77 + dest[i] = static_cast<unsigned char>(aSource[i]); 1.78 + } 1.79 + 1.80 + // Walk 32 bytes (two XMM registers) at a time. 1.81 + for (; aSourceLength - i > 31; i += 32) { 1.82 + __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i)); 1.83 + __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16)); 1.84 + 1.85 + // Interleave 0s in with the bytes of source to create lo and hi. 1.86 + __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128()); 1.87 + __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128()); 1.88 + __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128()); 1.89 + __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128()); 1.90 + 1.91 + // store lo and hi into dest. 1.92 + _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1); 1.93 + _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1); 1.94 + _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2); 1.95 + _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2); 1.96 + } 1.97 + 1.98 + // Finish up whatever's left. 1.99 + for (; i < aSourceLength; i++) { 1.100 + dest[i] = static_cast<unsigned char>(aSource[i]); 1.101 + } 1.102 + 1.103 + mDestination += i; 1.104 +}