media/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

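; SAD_FN emits the prologue shared by all the SAD kernels below. As used by
; the instantiations in this file:
;   %1, %2 = block width and height (only used to form the sad%1x%2 name),
;   %3     = 5 or 7; it is the general-purpose register count handed to
;            cglobal, and 7 also selects the variant that sets up the
;            src_stride3/ref_stride3 registers for stride*3 addressing,
;   %4     = 0 for plain SAD, 1 for the _avg variant, which takes an extra
;            second_pred argument.
; cglobal (from x86inc.asm) declares the function and binds the named
; arguments to registers; movsxdifnidn sign-extends the 32-bit stride
; arguments where the 32- and 64-bit register names differ.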
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea          src_stride3q, [src_strideq*3]
  lea          ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
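;
; As a point of reference, each sadWxH kernel below computes the same value
; as this scalar sketch (illustrative only, not part of the library):
;
;   sad = 0;
;   for (row = 0; row < H; row++)
;     for (col = 0; col < W; col++)
;       sad += abs(src[row * src_stride + col] - ref[row * ref_stride + col]);
;   return sad;
;
; The _avg variants first replace each ref byte with the rounded average of
; ref and second_pred (pavgb) and accumulate |src - avg| instead.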
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

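  ; psadbw leaves one partial sum in each 64-bit lane of the accumulator,
  ; so fold the high lane into the low one before returning the total.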
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
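; A 32-byte row needs two xmm loads, so this kernel handles two rows per
; iteration (hence the %1/2 row counter): row 0 at [refq]/[refq+16] and
; row 1 at [refq+ref_strideq]/[refq+ref_strideq+16].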
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
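; A 16-byte row is a single xmm load, so four rows are processed per
; iteration (%1/4), addressed at stride*0/1/2/3; the stride*3 offsets use
; the src_stride3/ref_stride3 registers set up by the 7-register SAD_FN
; variant.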
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
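; Rows are only 8 bytes wide here, so two rows are packed into each xmm
; register (movh fills the low half, movhps the high half) and one psadbw
; covers both; four rows are consumed per iteration.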
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4,8}_sse(uint8_t *src, int src_stride,
;                                 uint8_t *ref, int ref_stride);
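; This kernel is built with INIT_MMX sse, so the m registers are 64-bit MMX
; registers: punpckldq packs two 4-byte rows into each one, and psadbw then
; yields a single partial sum per register, which is why no movhlps
; reduction is needed before the final movd.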
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m3, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m6, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m6
  psadbw                m1, m2
  psadbw                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movd                 eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
