Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.
/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters (adds 2 before the >> 2).
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
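// Editor's note: a rough C sketch of the loop below (point sampling by
// keeping the odd source bytes):
//   for (int i = 0; i < dst_width; ++i) dst_ptr[i] = src_ptr[i * 2 + 1];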
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
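// Editor's note: pavgw gives a rounded average, so each output is roughly
//   dst_ptr[i] = (src_ptr[i * 2] + src_ptr[i * 2 + 1] + 1) >> 1;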
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0  // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
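// Editor's note: a rounded 2x2 box filter; with s and t the two source rows,
//   dst_ptr[i] = (s[i * 2] + s[i * 2 + 1] + t[i * 2] + t[i * 2 + 1] + 2) >> 2;
// The cascaded pavgb/pavgw below rounds at each stage, so results can differ
// from the exact expression by one.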
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0  // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none (uses unaligned movdqu loads and stores).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none (uses unaligned movdqu loads and stores).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0  // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none (uses unaligned movdqu loads and stores).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0  // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
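// Editor's note: approximates a rounded 4x4 box filter. Rows are combined
// with three levels of pairwise pavgb (rows 0+1, rows 2+3, then the two
// results) and columns are halved twice with pavgw; each stage rounds, so
// the result can differ slightly from (sum of the 4x4 block + 8) >> 4.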
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm7, xmm7  // generate mask 0x00ff00ff
    psrlw xmm7, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, [eax + esi * 2]
    movdqa xmm3, [eax + esi * 2 + 16]
    movdqa xmm4, [eax + edi]
    movdqa xmm5, [eax + edi + 16]
    lea eax, [eax + 32]
    pavgb xmm2, xmm4
    pavgb xmm3, xmm5
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0  // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm7
    pand xmm3, xmm7
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    movdqa xmm2, xmm0  // average columns (16 to 8 pixels)
    psrlw xmm0, 8
    pand xmm2, xmm7
    pavgw xmm0, xmm2
    packuswb xmm0, xmm0

    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    pop edi
    pop esi
    ret
  }
}

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palignr may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, kShuf0
    movdqa xmm4, kShuf1
    movdqa xmm5, kShuf2

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palignr may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
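// Editor's note: after the two source rows are averaged, each output pixel
// is a weighted pair of neighbors (weights from kMadd01/kMadd11/kMadd21),
// approximately dst = (w0 * a + w1 * b + 2) >> 2 with (w0, w1) one of
// (3,1), (2,2) or (1,3); kRound34 supplies the +2 rounding term.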
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]  // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]  // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// Note that movdqa+palignr may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]  // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]  // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
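// Editor's note: per 16 source bytes the shuffles keep bytes
// {0, 3, 6, 8, 11, 14}, i.e. 6 of every 16 pixels (3/8).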
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, kShuf38a
    movdqa xmm5, kShuf38b

    align 4
  xloop:
    movdqa xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    sub ecx, 12
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    jg xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
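// Editor's note: column sums of each 3x3 (or 2x3) block are divided via
// pmulhuw with kScaleAc33, i.e. (sum * (65536 / n)) >> 16, which
// approximates sum / n for n = 9 or 6.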
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShufAc
    movdqa xmm3, kShufAc3
    movdqa xmm4, kScaleAc33
    pxor xmm5, xmm5

    align 4
  xloop:
    movdqa xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqa xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    sub ecx, 6
    movd [edx], xmm6  // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShufAb0
    movdqa xmm3, kShufAb1
    movdqa xmm4, kShufAb2
    movdqa xmm5, kScaleAb2

    align 4
  xloop:
    movdqa xmm0, [eax]  // average 2 rows into xmm0
    pavgb xmm0, [eax + esi]
    lea eax, [eax + 16]

    movdqa xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    sub ecx, 6
    movd [edx], xmm1  // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
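// Editor's note: for each of 16 byte columns this accumulates a saturating
// 16-bit sum over src_height rows, roughly
//   dst_ptr[i] = sum over y of src_ptr[i + y * src_stride];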
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov esi, [esp + 16 + 4]  // src_ptr
    mov edx, [esp + 16 + 8]  // src_stride
    mov edi, [esp + 16 + 12]  // dst_ptr
    mov ecx, [esp + 16 + 16]  // src_width
    mov ebx, [esp + 16 + 20]  // src_height
    pxor xmm4, xmm4
    dec ebx

    align 4
  xloop:
    // first row
    movdqa xmm0, [esi]
    lea eax, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm4
    punpckhbw xmm1, xmm4
    lea esi, [esi + 16]
    mov ebp, ebx
    test ebp, ebp
    je ydone

    // sum remaining rows
    align 4
  yloop:
    movdqa xmm2, [eax]  // read 16 pixels
    lea eax, [eax + edx]  // advance to next row
    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm4
    punpckhbw xmm3, xmm4
    paddusw xmm0, xmm2  // sum 16 words
    paddusw xmm1, xmm3
    sub ebp, 1
    jg yloop

    align 4
  ydone:
    movdqa [edi], xmm0
    movdqa [edi + 16], xmm1
    lea edi, [edi + 32]

    sub ecx, 16
    jg xloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//     xor ebx, ebx
//     mov bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//     movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

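// Editor's note: x and dx are 16.16 fixed point. The loop extracts the
// integer part of x (pextrw of word 1) and the top 7 bits of the fraction f,
// then blends the two neighboring pixels, approximately
//   dst = (src[xi] * (127 - f) + src[xi + 1] * f) >> 7;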
__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // 8 bits, 2 pixels.
    movd ebx, xmm0
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // 16 bit
    psrlw xmm0, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // 8 bits
    movd ebx, xmm0
    mov [edi], bl

    align 4
  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
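// Editor's note: equivalent C sketch,
//   dst_ptr[i * 2] = dst_ptr[i * 2 + 1] = src_ptr[i];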
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    sub ecx, 32
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
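// Editor's note: treating each ARGB pixel as one 32-bit word, shufps 0xdd
// keeps the odd words, i.e. dst[i] = src[i * 2 + 1];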
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
    align 4
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    sub ecx, 4  // 4 pixels
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    jge xloop4

    align 4
  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixel remainder.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixel remainder.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
    align 4
  xloop99:

    pop esi
    pop edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, kShuffleColARGB
    movdqa xmm5, kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0

    align 4
  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
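// Editor's note: equivalent C sketch,
//   return (int)((((int64)num) << 16) / div);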
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
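// Editor's note: equivalent C sketch,
//   return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));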
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif