Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 2 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 3 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 4 | |
michael@0 | 5 | #include "nscore.h" |
michael@0 | 6 | #include "nsAlgorithm.h" |
michael@0 | 7 | #include <emmintrin.h> |
michael@0 | 8 | #include <nsUTF8Utils.h> |
michael@0 | 9 | |
michael@0 | 10 | void |
michael@0 | 11 | LossyConvertEncoding16to8::write_sse2(const char16_t* aSource, |
michael@0 | 12 | uint32_t aSourceLength) |
michael@0 | 13 | { |
michael@0 | 14 | char* dest = mDestination; |
michael@0 | 15 | |
michael@0 | 16 | // Align source to a 16-byte boundary. |
michael@0 | 17 | uint32_t i = 0; |
michael@0 | 18 | uint32_t alignLen = |
michael@0 | 19 | XPCOM_MIN<uint32_t>(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t)); |
michael@0 | 20 | for (; i < alignLen; i++) { |
michael@0 | 21 | dest[i] = static_cast<unsigned char>(aSource[i]); |
michael@0 | 22 | } |
michael@0 | 23 | |
michael@0 | 24 | // Walk 64 bytes (four XMM registers) at a time. |
michael@0 | 25 | __m128i vectmask = _mm_set1_epi16(0x00ff); |
michael@0 | 26 | for (; aSourceLength - i > 31; i += 32) { |
michael@0 | 27 | __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i)); |
michael@0 | 28 | source1 = _mm_and_si128(source1, vectmask); |
michael@0 | 29 | |
michael@0 | 30 | __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8)); |
michael@0 | 31 | source2 = _mm_and_si128(source2, vectmask); |
michael@0 | 32 | |
michael@0 | 33 | __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16)); |
michael@0 | 34 | source3 = _mm_and_si128(source3, vectmask); |
michael@0 | 35 | |
michael@0 | 36 | __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24)); |
michael@0 | 37 | source4 = _mm_and_si128(source4, vectmask); |
michael@0 | 38 | |
michael@0 | 39 | |
michael@0 | 40 | // Pack the source data. SSE2 views this as a saturating uint16_t to |
michael@0 | 41 | // uint8_t conversion, but since we masked off the high-order byte of every |
michael@0 | 42 | // uint16_t, we're really just grabbing the low-order bytes of source1 and |
michael@0 | 43 | // source2. |
michael@0 | 44 | __m128i packed1 = _mm_packus_epi16(source1, source2); |
michael@0 | 45 | __m128i packed2 = _mm_packus_epi16(source3, source4); |
michael@0 | 46 | |
michael@0 | 47 | // This store needs to be unaligned since there's no guarantee that the |
michael@0 | 48 | // alignment we did above for the source will align the destination. |
michael@0 | 49 | _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1); |
michael@0 | 50 | _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2); |
michael@0 | 51 | } |
michael@0 | 52 | |
michael@0 | 53 | // Finish up the rest. |
michael@0 | 54 | for (; i < aSourceLength; i++) { |
michael@0 | 55 | dest[i] = static_cast<unsigned char>(aSource[i]); |
michael@0 | 56 | } |
michael@0 | 57 | |
michael@0 | 58 | mDestination += i; |
michael@0 | 59 | } |
michael@0 | 60 | |
michael@0 | 61 | void |
michael@0 | 62 | LossyConvertEncoding8to16::write_sse2(const char* aSource, |
michael@0 | 63 | uint32_t aSourceLength) |
michael@0 | 64 | { |
michael@0 | 65 | char16_t *dest = mDestination; |
michael@0 | 66 | |
michael@0 | 67 | // Align source to a 16-byte boundary. We choose to align source rather than |
michael@0 | 68 | // dest because we'd rather have our loads than our stores be fast. You have |
michael@0 | 69 | // to wait for a load to complete, but you can keep on moving after issuing a |
michael@0 | 70 | // store. |
michael@0 | 71 | uint32_t i = 0; |
michael@0 | 72 | uint32_t alignLen = XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf)); |
michael@0 | 73 | for (; i < alignLen; i++) { |
michael@0 | 74 | dest[i] = static_cast<unsigned char>(aSource[i]); |
michael@0 | 75 | } |
michael@0 | 76 | |
michael@0 | 77 | // Walk 32 bytes (two XMM registers) at a time. |
michael@0 | 78 | for (; aSourceLength - i > 31; i += 32) { |
michael@0 | 79 | __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i)); |
michael@0 | 80 | __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16)); |
michael@0 | 81 | |
michael@0 | 82 | // Interleave 0s in with the bytes of source to create lo and hi. |
michael@0 | 83 | __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128()); |
michael@0 | 84 | __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128()); |
michael@0 | 85 | __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128()); |
michael@0 | 86 | __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128()); |
michael@0 | 87 | |
michael@0 | 88 | // store lo and hi into dest. |
michael@0 | 89 | _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1); |
michael@0 | 90 | _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1); |
michael@0 | 91 | _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2); |
michael@0 | 92 | _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2); |
michael@0 | 93 | } |
michael@0 | 94 | |
michael@0 | 95 | // Finish up whatever's left. |
michael@0 | 96 | for (; i < aSourceLength; i++) { |
michael@0 | 97 | dest[i] = static_cast<unsigned char>(aSource[i]); |
michael@0 | 98 | } |
michael@0 | 99 | |
michael@0 | 100 | mDestination += i; |
michael@0 | 101 | } |