Thu, 22 Jan 2015 13:21:57 +0100
Incorporate changes requested in the Mozilla review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
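; SAD_FN emits the shared prologue for each SAD kernel:
;   %1 = block width, %2 = block height,
;   %3 = number of general-purpose registers to request (5, or 7 when the
;        kernel needs precomputed stride*3 offsets),
;   %4 = 0 for plain SAD, 1 for the _avg variant that takes second_pred.
; It declares the function via cglobal, sign-extends both strides where
; needed, and, in the 7-register case, precomputes src_stride*3 and
; ref_stride*3.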
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea          src_stride3q, [src_strideq*3]
  lea          ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
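; The _avg forms generated by the same macro take one extra argument,
; second_pred, which is averaged into the reference rows with pavgb before
; the SAD is computed.  Note that second_pred and src are used as aligned
; memory operands (pavgb/psadbw), while ref is loaded unaligned with movu.
; Each iteration of the 64-wide loop below covers one full row via four
; 16-byte loads.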
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov          n_rowsd, %1
  pxor         m0, m0
.loop:
  movu         m1, [refq]
  movu         m2, [refq+16]
  movu         m3, [refq+32]
  movu         m4, [refq+48]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+16]
  psadbw       m3, [srcq+32]
  psadbw       m4, [srcq+48]
  paddd        m1, m2
  paddd        m3, m4
  add          refq, ref_strideq
  paddd        m0, m1
  add          srcq, src_strideq
  paddd        m0, m3
  dec          n_rowsd
  jg .loop
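  ; psadbw leaves one partial sum in each 64-bit half of m0; fold the high
  ; half onto the low half and return the 32-bit total in eax.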
  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
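; The 32-wide loop handles two rows per iteration (two 16-byte loads per
; row), so the row counter starts at height/2.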
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov          n_rowsd, %1/2
  pxor         m0, m0
.loop:
  movu         m1, [refq]
  movu         m2, [refq+16]
  movu         m3, [refq+ref_strideq]
  movu         m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+16]
  psadbw       m3, [srcq+src_strideq]
  psadbw       m4, [srcq+src_strideq+16]
  paddd        m1, m2
  paddd        m3, m4
  lea          refq, [refq+ref_strideq*2]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*2]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
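; The 16-wide loop handles four rows per iteration using the precomputed
; stride*3 offsets, so the row counter starts at height/4.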
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov          n_rowsd, %1/4
  pxor         m0, m0

.loop:
  movu         m1, [refq]
  movu         m2, [refq+ref_strideq]
  movu         m3, [refq+ref_strideq*2]
  movu         m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+src_strideq]
  psadbw       m3, [srcq+src_strideq*2]
  psadbw       m4, [srcq+src_stride3q]
  paddd        m1, m2
  paddd        m3, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
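; The 8-wide loop also handles four rows per iteration: movh/movhps pack two
; 8-byte rows into each xmm register, so only two psadbw ops are needed.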
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov          n_rowsd, %1/4
  pxor         m0, m0

.loop:
  movh         m1, [refq]
  movhps       m1, [refq+ref_strideq]
  movh         m2, [refq+ref_strideq*2]
  movhps       m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  lea          second_predq, [second_predq+mmsize*2]
%endif
  movh         m3, [srcq]
  movhps       m3, [srcq+src_strideq]
  movh         m4, [srcq+src_strideq*2]
  movhps       m4, [srcq+src_stride3q]
  psadbw       m1, m3
  psadbw       m2, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m2
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
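; The 4-wide kernels use MMX registers (INIT_MMX below): punpckldq packs two
; 4-byte rows into one 8-byte register, and psadbw on 64-bit registers leaves
; a single partial sum, so no final movhlps fold is needed.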
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov          n_rowsd, %1/4
  pxor         m0, m0

.loop:
  movd         m1, [refq]
  movd         m2, [refq+ref_strideq]
  movd         m3, [refq+ref_strideq*2]
  movd         m4, [refq+ref_stride3q]
  punpckldq    m1, m2
  punpckldq    m3, m4
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m3, [second_predq+mmsize*1]
  lea          second_predq, [second_predq+mmsize*2]
%endif
  movd         m2, [srcq]
  movd         m5, [srcq+src_strideq]
  movd         m4, [srcq+src_strideq*2]
  movd         m6, [srcq+src_stride3q]
  punpckldq    m2, m5
  punpckldq    m4, m6
  psadbw       m1, m2
  psadbw       m3, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movd         eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse