media/libvpx/vp8/encoder/x86/dct_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/x86/dct_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,432 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +; STACK_FRAME_CREATE: per-ABI prologue; binds the names input/output/pitch to the registers that hold the three arguments.
    1.17 +%macro STACK_FRAME_CREATE 0
    1.18 +%if ABI_IS_32BIT
    1.19 +  %define       input       rsi
    1.20 +  %define       output      rdi
    1.21 +  %define       pitch       rax
    1.22 +    push        rbp                             ; popped again by STACK_FRAME_DESTROY
    1.23 +    mov         rbp, rsp
    1.24 +    GET_GOT     rbx                             ; paired with RESTORE_GOT in STACK_FRAME_DESTROY
    1.25 +    push        rsi                             ; rsi/rdi are reused below as input/output
    1.26 +    push        rdi
    1.27 +    ; end prolog
    1.28 +
    1.29 +    mov         rsi, arg(0)                     ; input
    1.30 +    mov         rdi, arg(1)                     ; output
    1.31 +
    1.32 +    movsxd      rax, dword ptr arg(2)           ; pitch, sign-extended from 32 bits
    1.33 +    lea         rcx, [rsi + rax*2]              ; NOTE(review): rcx is not read by the functions in this file - confirm this is dead
    1.34 +%else                                           ; 64-bit: arguments are used from their incoming registers
    1.35 +  %if LIBVPX_YASM_WIN64
    1.36 +    %define     input       rcx
    1.37 +    %define     output      rdx
    1.38 +    %define     pitch       r8
    1.39 +    SAVE_XMM 7, u                               ; presumably preserves xmm6-xmm7 (callee-saved on Win64) - see x86_abi_support.asm; paired with RESTORE_XMM
    1.40 +  %else
    1.41 +    %define     input       rdi
    1.42 +    %define     output      rsi
    1.43 +    %define     pitch       rdx
    1.44 +  %endif                                        ; NOTE(review): the prototypes declare pitch as int, but r8/rdx are used as full 64-bit registers with no sign extension (the 32-bit path uses movsxd) - confirm callers leave the upper 32 bits clean
    1.45 +%endif
    1.46 +%endmacro
    1.47 +; STACK_FRAME_DESTROY: clears the input/output/pitch names, undoes STACK_FRAME_CREATE for the active ABI, and returns.
    1.48 +%macro STACK_FRAME_DESTROY 0
    1.49 +  %define     input
    1.50 +  %define     output
    1.51 +  %define     pitch
    1.52 +
    1.53 +%if ABI_IS_32BIT
    1.54 +    pop         rdi                             ; pops mirror the pushes in STACK_FRAME_CREATE, in reverse order
    1.55 +    pop         rsi
    1.56 +    RESTORE_GOT                                 ; paired with GET_GOT rbx
    1.57 +    pop         rbp
    1.58 +%else
    1.59 +  %if LIBVPX_YASM_WIN64
    1.60 +    RESTORE_XMM                                 ; paired with SAVE_XMM 7, u
    1.61 +  %endif
    1.62 +%endif
    1.63 +    ret                                         ; the macro itself emits the function return
    1.64 +%endmacro
    1.65 +
    1.66 +;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
    1.67 +global sym(vp8_short_fdct4x4_sse2) PRIVATE
    1.68 +sym(vp8_short_fdct4x4_sse2):
    1.69 +; Loads four rows of four 16-bit values (rows are pitch bytes apart), runs two transform passes, and stores 16 words to output with movdqa (so output must be 16-byte aligned).
    1.70 +    STACK_FRAME_CREATE
    1.71 +
    1.72 +    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    1.73 +    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    1.74 +    lea         input,          [input+2*pitch] ;advance two rows; pitch is applied as a byte offset
    1.75 +    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    1.76 +    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
    1.77 +
    1.78 +    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    1.79 +    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
    1.80 +
    1.81 +    movdqa      xmm2, xmm0                      ;copy, because punpckldq overwrites xmm0
    1.82 +    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    1.83 +    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    1.84 +    movdqa      xmm1, xmm0                      ;keep the 23 22 03 02 pairs for the swap below
    1.85 +    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    1.86 +    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    1.87 +    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
    1.88 +
    1.89 +    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    1.90 +    movdqa      xmm3, xmm0                      ;second copy for the subtraction
    1.91 +    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    1.92 +    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    1.93 +    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    1.94 +    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
    1.95 +
    1.96 +    movdqa      xmm1, xmm0                      ;pmaddwd sums each adjacent word pair into a dword
    1.97 +    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    1.98 +    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
    1.99 +    movdqa      xmm4, xmm3
   1.100 +    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
   1.101 +    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
   1.102 +
   1.103 +    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]       ;rounding term added before the >>12
   1.104 +    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]        ;rounding term added before the >>12
   1.105 +    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
   1.106 +    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
   1.107 +
   1.108 +    packssdw    xmm0, xmm1                      ;op[2] op[0]
   1.109 +    packssdw    xmm3, xmm4                      ;op[3] op[1]
   1.110 +    ; 23 22 21 20 03 02 01 00
   1.111 +    ;
   1.112 +    ; 33 32 31 30 13 12 11 10
   1.113 +    ;
   1.114 +    movdqa      xmm2, xmm0
   1.115 +    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
   1.116 +    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
   1.117 +
   1.118 +    movdqa      xmm3, xmm0
   1.119 +    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
   1.120 +    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
   1.121 +    movdqa      xmm2, xmm0
   1.122 +    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
   1.123 +    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
   1.124 +
   1.125 +    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ;dword 7s: rounding term for the >>4 below
   1.126 +    pshufd      xmm2, xmm2, 04eh                ;swap the two 64-bit halves
   1.127 +    movdqa      xmm3, xmm0
   1.128 +    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
   1.129 +    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
   1.130 +
   1.131 +    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
   1.132 +    movdqa      xmm2, xmm3                      ;save d1 for compare
   1.133 +    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
   1.134 +    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
   1.135 +    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
   1.136 +    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
   1.137 +    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
   1.138 +    movdqa      xmm1, xmm0
   1.139 +    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
   1.140 +    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
   1.141 +
   1.142 +    pxor        xmm4, xmm4                      ;zero out for compare
   1.143 +    paddd       xmm0, xmm5                      ;+ 7
   1.144 +    paddd       xmm1, xmm5                      ;+ 7
   1.145 +    pcmpeqw     xmm2, xmm4                      ;0xFFFF in every word that is zero (d1 words sit in the low half)
   1.146 +    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
   1.147 +    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
   1.148 +    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
   1.149 +                                                     ;and keep bit 0 of lower
   1.150 +
   1.151 +    movdqa      xmm4, xmm3
   1.152 +    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
   1.153 +    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
   1.154 +    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]        ;rounding term added before the >>16
   1.155 +    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]        ;rounding term added before the >>16
   1.156 +    packssdw    xmm0, xmm1                      ;op[8] op[0]
   1.157 +    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
   1.158 +    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
   1.159 +
   1.160 +    packssdw    xmm3, xmm4                      ;op[12] op[4]
   1.161 +    movdqa      xmm1, xmm0
   1.162 +    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
   1.163 +    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
   1.164 +    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
   1.165 +
   1.166 +    movdqa      XMMWORD PTR[output +  0], xmm0  ;aligned store: op[0..7]
   1.167 +    movdqa      XMMWORD PTR[output + 16], xmm1  ;aligned store: op[8..15]
   1.168 +
   1.169 +    STACK_FRAME_DESTROY
   1.170 +
   1.171 +;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
   1.172 +global sym(vp8_short_fdct8x4_sse2) PRIVATE
   1.173 +sym(vp8_short_fdct8x4_sse2):
   1.174 +; Loads four rows of eight 16-bit values (rows are pitch bytes apart), runs two transform passes on both 4-wide halves at once, and stores 32 words to output.
   1.175 +    STACK_FRAME_CREATE
   1.176 +
   1.177 +        ; read the input data
   1.178 +        movdqa      xmm0,       [input        ]     ; movdqa: every row address must be 16-byte aligned
   1.179 +        movdqa      xmm2,       [input+  pitch]
   1.180 +        lea         input,      [input+2*pitch]     ; advance two rows; pitch is applied as a byte offset
   1.181 +        movdqa      xmm4,       [input        ]
   1.182 +        movdqa      xmm3,       [input+  pitch]
   1.183 +
   1.184 +        ; transpose for the first stage
   1.185 +        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
   1.186 +        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
   1.187 +
   1.188 +        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
   1.189 +        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
   1.190 +
   1.191 +        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
   1.192 +        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
   1.193 +
   1.194 +        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
   1.195 +        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
   1.196 +
   1.197 +        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
   1.198 +
   1.199 +        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
   1.200 +        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
   1.201 +
   1.202 +        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
   1.203 +        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
   1.204 +
   1.205 +        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
   1.206 +        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
   1.207 +
   1.208 +        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
   1.209 +        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
   1.210 +
   1.211 +        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
   1.212 +
   1.213 +        ; xmm0 0
   1.214 +        ; xmm1 1
   1.215 +        ; xmm2 2
   1.216 +        ; xmm3 3
   1.217 +
   1.218 +        ; first stage
   1.219 +        movdqa      xmm5,       xmm0        ; copy of 0, becomes d1
   1.220 +        movdqa      xmm4,       xmm1        ; copy of 1, becomes c1
   1.221 +
   1.222 +        paddw       xmm0,       xmm3        ; a1 = 0 + 3
   1.223 +        paddw       xmm1,       xmm2        ; b1 = 1 + 2
   1.224 +
   1.225 +        psubw       xmm4,       xmm2        ; c1 = 1 - 2
   1.226 +        psubw       xmm5,       xmm3        ; d1 = 0 - 3
   1.227 +
   1.228 +        psllw       xmm5,        3          ; d1 <<= 3
   1.229 +        psllw       xmm4,        3          ; c1 <<= 3
   1.230 +
   1.231 +        psllw       xmm0,        3          ; a1 <<= 3
   1.232 +        psllw       xmm1,        3          ; b1 <<= 3
   1.233 +
   1.234 +        ; output 0 and 2
   1.235 +        movdqa      xmm2,       xmm0        ; a1
   1.236 +
   1.237 +        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
   1.238 +        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
   1.239 +
   1.240 +        ; output 1 and 3
   1.241 +        ; interleave c1, d1
   1.242 +        movdqa      xmm1,       xmm5        ; d1
   1.243 +        punpcklwd   xmm1,       xmm4        ; c1 d1
   1.244 +        punpckhwd   xmm5,       xmm4        ; c1 d1
   1.245 +
   1.246 +        movdqa      xmm3,       xmm1        ; second copy of each half for the other multiplier
   1.247 +        movdqa      xmm4,       xmm5
   1.248 +
   1.249 +        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   1.250 +        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   1.251 +
   1.252 +        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.253 +        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.254 +
   1.255 +        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]         ; rounding terms added before the >>12
   1.256 +        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
   1.257 +        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
   1.258 +        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
   1.259 +
   1.260 +        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   1.261 +        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   1.262 +        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   1.263 +        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   1.264 +
   1.265 +        packssdw    xmm1,       xmm4        ; op[1]
   1.266 +        packssdw    xmm3,       xmm5        ; op[3]
   1.267 +
   1.268 +        ; done with vertical
   1.269 +        ; transpose for the second stage
   1.270 +        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
   1.271 +        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
   1.272 +
   1.273 +        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
   1.274 +        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
   1.275 +
   1.276 +        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
   1.277 +        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
   1.278 +
   1.279 +        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
   1.280 +        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
   1.281 +
   1.282 +        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
   1.283 +
   1.284 +        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
   1.285 +        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
   1.286 +
   1.287 +        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
   1.288 +        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
   1.289 +
   1.290 +        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
   1.291 +        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
   1.292 +
   1.293 +        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
   1.294 +        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
   1.295 +
   1.296 +        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
   1.297 +
   1.298 +        ; xmm0 0
   1.299 +        ; xmm4 1
   1.300 +        ; xmm1 2
   1.301 +        ; xmm3 3
   1.302 +
   1.303 +        movdqa      xmm5,       xmm0        ; copy of 0, becomes d1
   1.304 +        movdqa      xmm2,       xmm1        ; copy of 2, subtracted from 1 below
   1.305 +
   1.306 +        paddw       xmm0,       xmm3        ; a1 = 0 + 3
   1.307 +        paddw       xmm1,       xmm4        ; b1 = 1 + 2
   1.308 +
   1.309 +        psubw       xmm4,       xmm2        ; c1 = 1 - 2
   1.310 +        psubw       xmm5,       xmm3        ; d1 = 0 - 3
   1.311 +
   1.312 +        pxor        xmm6,       xmm6        ; zero out for compare
   1.313 +
   1.314 +        pcmpeqw     xmm6,       xmm5        ; 0xFFFF where d1 == 0; the pandn below inverts this to d1 != 0
   1.315 +
   1.316 +        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
   1.317 +                                                                    ; and keep bit 0 of lower
   1.318 +
   1.319 +        ; output 0 and 2
   1.320 +        movdqa      xmm2,       xmm0        ; a1
   1.321 +
   1.322 +        paddw       xmm0,       xmm1        ; a1 + b1
   1.323 +        psubw       xmm2,       xmm1        ; a1 - b1
   1.324 +
   1.325 +        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]            ; + 7, done in 16-bit lanes
   1.326 +        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]            ; + 7, done in 16-bit lanes
   1.327 +
   1.328 +        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
   1.329 +        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
   1.330 +
   1.331 +        ; output 1 and 3
   1.332 +        ; interleave c1, d1
   1.333 +        movdqa      xmm1,       xmm5        ; d1
   1.334 +        punpcklwd   xmm1,       xmm4        ; c1 d1
   1.335 +        punpckhwd   xmm5,       xmm4        ; c1 d1
   1.336 +
   1.337 +        movdqa      xmm3,       xmm1        ; second copy of each half for the other multiplier
   1.338 +        movdqa      xmm4,       xmm5
   1.339 +
   1.340 +        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   1.341 +        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   1.342 +
   1.343 +        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.344 +        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.345 +
   1.346 +        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]         ; rounding terms added before the >>16
   1.347 +        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
   1.348 +        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
   1.349 +        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
   1.350 +
   1.351 +        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   1.352 +        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   1.353 +        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   1.354 +        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   1.355 +
   1.356 +        packssdw    xmm1,       xmm4        ; op[4]
   1.357 +        packssdw    xmm3,       xmm5        ; op[12]
   1.358 +
   1.359 +        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
   1.360 +
   1.361 +        movdqa      xmm4,       xmm0        ; copy op[0] so its high half survives
   1.362 +        movdqa      xmm5,       xmm2        ; copy op[8] so its high half survives
   1.363 +
   1.364 +        punpcklqdq  xmm0,       xmm1        ; low halves of op[0] and op[4]
   1.365 +        punpckhqdq  xmm4,       xmm1        ; high halves of op[0] and op[4]
   1.366 +
   1.367 +        punpcklqdq  xmm2,       xmm3        ; low halves of op[8] and op[12]
   1.368 +        punpckhqdq  xmm5,       xmm3        ; high halves of op[8] and op[12]
   1.369 +
   1.370 +        movdqa      XMMWORD PTR[output + 0 ],  xmm0     ; low-half results: op[0], op[4]
   1.371 +        movdqa      XMMWORD PTR[output + 16],  xmm2     ; low-half results: op[8], op[12]
   1.372 +        movdqa      XMMWORD PTR[output + 32],  xmm4     ; high-half results: op[0], op[4]
   1.373 +        movdqa      XMMWORD PTR[output + 48],  xmm5     ; high-half results: op[8], op[12]
   1.374 +
   1.375 +    STACK_FRAME_DESTROY
   1.376 +
   1.377 +SECTION_RODATA
   1.378 +align 16
   1.379 +_5352_2217:                     ; pmaddwd multipliers: even word * 5352 + odd word * 2217
   1.380 +    dw 5352
   1.381 +    dw 2217
   1.382 +    dw 5352
   1.383 +    dw 2217
   1.384 +    dw 5352
   1.385 +    dw 2217
   1.386 +    dw 5352
   1.387 +    dw 2217
   1.388 +align 16
   1.389 +_2217_neg5352:                  ; pmaddwd multipliers: even word * 2217 - odd word * 5352
   1.390 +    dw 2217
   1.391 +    dw -5352
   1.392 +    dw 2217
   1.393 +    dw -5352
   1.394 +    dw 2217
   1.395 +    dw -5352
   1.396 +    dw 2217
   1.397 +    dw -5352
   1.398 +align 16
   1.399 +_mult_add:                      ; all ones: pmaddwd with this adds each adjacent word pair
   1.400 +    times 8 dw 1
   1.401 +align 16
   1.402 +_cmp_mask:                      ; 1 in the four low words, 0 in the four high words (4x4 d1 != 0 mask)
   1.403 +    times 4 dw 1
   1.404 +    times 4 dw 0
   1.405 +align 16
   1.406 +_cmp_mask8x4:                   ; 1 in all eight words (8x4 d1 != 0 mask)
   1.407 +    times 8 dw 1
   1.408 +align 16
   1.409 +_mult_sub:                      ; +1/-1 pairs: pmaddwd with this subtracts the odd word from the even word
   1.410 +    dw 1
   1.411 +    dw -1
   1.412 +    dw 1
   1.413 +    dw -1
   1.414 +    dw 1
   1.415 +    dw -1
   1.416 +    dw 1
   1.417 +    dw -1
   1.418 +align 16
   1.419 +_7:                             ; dword 7s, added before the >>4 in the 4x4 second pass
   1.420 +    times 4 dd 7
   1.421 +align 16
   1.422 +_7w:                            ; word 7s, added before the >>4 in the 8x4 second pass
   1.423 +    times 8 dw 7
   1.424 +align 16
   1.425 +_14500:                         ; first-pass term added before >>12 on the c1*2217 + d1*5352 sum
   1.426 +    times 4 dd 14500
   1.427 +align 16
   1.428 +_7500:                          ; first-pass term added before >>12 on the d1*2217 - c1*5352 sum
   1.429 +    times 4 dd 7500
   1.430 +align 16
   1.431 +_12000:                         ; second-pass term added before >>16 on the c1*2217 + d1*5352 sum
   1.432 +    times 4 dd 12000
   1.433 +align 16
   1.434 +_51000:                         ; second-pass term added before >>16 on the d1*2217 - c1*5352 sum
   1.435 +    times 4 dd 51000

mercurial