michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nscore.h" michael@0: #include "nsAlgorithm.h" michael@0: #include michael@0: #include michael@0: michael@0: void michael@0: LossyConvertEncoding16to8::write_sse2(const char16_t* aSource, michael@0: uint32_t aSourceLength) michael@0: { michael@0: char* dest = mDestination; michael@0: michael@0: // Align source to a 16-byte boundary. michael@0: uint32_t i = 0; michael@0: uint32_t alignLen = michael@0: XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t)); michael@0: for (; i < alignLen; i++) { michael@0: dest[i] = static_cast(aSource[i]); michael@0: } michael@0: michael@0: // Walk 64 bytes (four XMM registers) at a time. michael@0: __m128i vectmask = _mm_set1_epi16(0x00ff); michael@0: for (; aSourceLength - i > 31; i += 32) { michael@0: __m128i source1 = _mm_load_si128(reinterpret_cast(aSource + i)); michael@0: source1 = _mm_and_si128(source1, vectmask); michael@0: michael@0: __m128i source2 = _mm_load_si128(reinterpret_cast(aSource + i + 8)); michael@0: source2 = _mm_and_si128(source2, vectmask); michael@0: michael@0: __m128i source3 = _mm_load_si128(reinterpret_cast(aSource + i + 16)); michael@0: source3 = _mm_and_si128(source3, vectmask); michael@0: michael@0: __m128i source4 = _mm_load_si128(reinterpret_cast(aSource + i + 24)); michael@0: source4 = _mm_and_si128(source4, vectmask); michael@0: michael@0: michael@0: // Pack the source data. SSE2 views this as a saturating uint16_t to michael@0: // uint8_t conversion, but since we masked off the high-order byte of every michael@0: // uint16_t, we're really just grabbing the low-order bytes of source1 and michael@0: // source2. michael@0: __m128i packed1 = _mm_packus_epi16(source1, source2); michael@0: __m128i packed2 = _mm_packus_epi16(source3, source4); michael@0: michael@0: // This store needs to be unaligned since there's no guarantee that the michael@0: // alignment we did above for the source will align the destination. michael@0: _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1); michael@0: _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2); michael@0: } michael@0: michael@0: // Finish up the rest. michael@0: for (; i < aSourceLength; i++) { michael@0: dest[i] = static_cast(aSource[i]); michael@0: } michael@0: michael@0: mDestination += i; michael@0: } michael@0: michael@0: void michael@0: LossyConvertEncoding8to16::write_sse2(const char* aSource, michael@0: uint32_t aSourceLength) michael@0: { michael@0: char16_t *dest = mDestination; michael@0: michael@0: // Align source to a 16-byte boundary. We choose to align source rather than michael@0: // dest because we'd rather have our loads than our stores be fast. You have michael@0: // to wait for a load to complete, but you can keep on moving after issuing a michael@0: // store. michael@0: uint32_t i = 0; michael@0: uint32_t alignLen = XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf)); michael@0: for (; i < alignLen; i++) { michael@0: dest[i] = static_cast(aSource[i]); michael@0: } michael@0: michael@0: // Walk 32 bytes (two XMM registers) at a time. michael@0: for (; aSourceLength - i > 31; i += 32) { michael@0: __m128i source1 = _mm_load_si128(reinterpret_cast(aSource + i)); michael@0: __m128i source2 = _mm_load_si128(reinterpret_cast(aSource + i + 16)); michael@0: michael@0: // Interleave 0s in with the bytes of source to create lo and hi. michael@0: __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128()); michael@0: __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128()); michael@0: __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128()); michael@0: __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128()); michael@0: michael@0: // store lo and hi into dest. michael@0: _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1); michael@0: _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1); michael@0: _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2); michael@0: _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2); michael@0: } michael@0: michael@0: // Finish up whatever's left. michael@0: for (; i < aSourceLength; i++) { michael@0: dest[i] = static_cast(aSource[i]); michael@0: } michael@0: michael@0: mDestination += i; michael@0: }