michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: // This file should only be compiled if you're on x86 or x86_64. Additionally, michael@0: // you'll need to compile this file with -msse2 if you're using gcc. michael@0: michael@0: #include michael@0: #include "nscore.h" michael@0: michael@0: namespace mozilla { michael@0: namespace SSE2 { michael@0: michael@0: void michael@0: Convert_ascii_run(const char *&src, michael@0: char16_t *&dst, michael@0: int32_t len) michael@0: { michael@0: if (len > 15) { michael@0: __m128i in, out1, out2; michael@0: __m128d *outp1, *outp2; michael@0: __m128i zeroes; michael@0: uint32_t offset; michael@0: michael@0: // align input to 16 bytes michael@0: while ((NS_PTR_TO_UINT32(src) & 15) && len > 0) { michael@0: if (*src & 0x80U) michael@0: return; michael@0: *dst++ = (char16_t) *src++; michael@0: len--; michael@0: } michael@0: michael@0: zeroes = _mm_setzero_si128(); michael@0: michael@0: offset = NS_PTR_TO_UINT32(dst) & 15; michael@0: michael@0: // Note: all these inner loops have to break, not return; we need michael@0: // to let the single-char loop below catch any leftover michael@0: // byte-at-a-time ASCII chars, since this function must consume michael@0: // all available ASCII chars before it returns michael@0: michael@0: if (offset == 0) { michael@0: while (len > 15) { michael@0: in = _mm_load_si128((__m128i *) src); michael@0: if (_mm_movemask_epi8(in)) michael@0: break; michael@0: out1 = _mm_unpacklo_epi8(in, zeroes); michael@0: out2 = _mm_unpackhi_epi8(in, zeroes); michael@0: _mm_stream_si128((__m128i *) dst, out1); michael@0: _mm_stream_si128((__m128i *) (dst + 8), out2); michael@0: dst += 16; michael@0: src += 16; michael@0: len -= 16; michael@0: } michael@0: } else if (offset == 8) { michael@0: outp1 = (__m128d *) &out1; michael@0: outp2 = (__m128d *) &out2; michael@0: while (len > 15) { michael@0: in = _mm_load_si128((__m128i *) src); michael@0: if (_mm_movemask_epi8(in)) michael@0: break; michael@0: out1 = _mm_unpacklo_epi8(in, zeroes); michael@0: out2 = _mm_unpackhi_epi8(in, zeroes); michael@0: _mm_storel_epi64((__m128i *) dst, out1); michael@0: _mm_storel_epi64((__m128i *) (dst + 8), out2); michael@0: _mm_storeh_pd((double *) (dst + 4), *outp1); michael@0: _mm_storeh_pd((double *) (dst + 12), *outp2); michael@0: src += 16; michael@0: dst += 16; michael@0: len -= 16; michael@0: } michael@0: } else { michael@0: while (len > 15) { michael@0: in = _mm_load_si128((__m128i *) src); michael@0: if (_mm_movemask_epi8(in)) michael@0: break; michael@0: out1 = _mm_unpacklo_epi8(in, zeroes); michael@0: out2 = _mm_unpackhi_epi8(in, zeroes); michael@0: _mm_storeu_si128((__m128i *) dst, out1); michael@0: _mm_storeu_si128((__m128i *) (dst + 8), out2); michael@0: src += 16; michael@0: dst += 16; michael@0: len -= 16; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // finish off a byte at a time michael@0: michael@0: while (len-- > 0 && (*src & 0x80U) == 0) { michael@0: *dst++ = (char16_t) *src++; michael@0: } michael@0: } michael@0: michael@0: } // namespace SSE2 michael@0: } // namespace mozilla