security/nss/lib/freebl/mpi/mpi_sse2.s

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/mpi/mpi_sse2.s	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,294 @@
     1.4 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.5 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.7 +# Under DARWIN each exported name below is remapped to its underscore-prefixed form and TYPE_FUNCTION expands to nothing; otherwise TYPE_FUNCTION emits a .type x, @function directive.
     1.8 +#ifdef DARWIN
     1.9 +#define s_mpv_mul_d          _s_mpv_mul_d
    1.10 +#define s_mpv_mul_d_add      _s_mpv_mul_d_add
    1.11 +#define s_mpv_mul_d_add_prop _s_mpv_mul_d_add_prop
    1.12 +#define s_mpv_sqr_add_prop   _s_mpv_sqr_add_prop
    1.13 +#define s_mpv_div_2dx1d      _s_mpv_div_2dx1d
    1.14 +#define TYPE_FUNCTION(x)
    1.15 +#else
    1.16 +#define TYPE_FUNCTION(x) .type x, @function
    1.17 +#endif
    1.18 +
    1.19 +.text
    1.20 + #  s_mpv_mul_d: for each of the a_len words of a, c[i] = low 32 bits of (a[i] * b + carry); the final carry is stored in c[a_len]
    1.21 + #  ebp - 8:    caller's esi
    1.22 + #  ebp - 4:    caller's edi
    1.23 + #  ebp + 0:    caller's ebp
    1.24 + #  ebp + 4:    return address
    1.25 + #  ebp + 8:    a       argument
    1.26 + #  ebp + 12:   a_len   argument
    1.27 + #  ebp + 16:   b       argument
    1.28 + #  ebp + 20:   c       argument
    1.29 + #  registers:  (MMX: mm0 = product, mm1 = b, mm2 = running carry)
    1.30 + #      ebx:    (not used)
    1.31 + #      ecx:    a_len
    1.32 + #      esi:    a ptr
    1.33 + #      edi:    c ptr
    1.34 +.globl s_mpv_mul_d
    1.35 +.private_extern s_mpv_mul_d
    1.36 +TYPE_FUNCTION(s_mpv_mul_d)
    1.37 +s_mpv_mul_d:
    1.38 +    push   %ebp                 # build the ebp frame described above
    1.39 +    mov    %esp, %ebp
    1.40 +    push   %edi                 # save the registers this routine overwrites
    1.41 +    push   %esi
    1.42 +    psubq  %mm2, %mm2           # carry = 0 (mm2 - mm2)
    1.43 +    mov    12(%ebp), %ecx       # ecx = a_len
    1.44 +    movd   16(%ebp), %mm1       # mm1 = b
    1.45 +    mov    20(%ebp), %edi       # edi = c
    1.46 +    cmp    $0, %ecx
    1.47 +    je     2f                   # a_len == 0: skip the loop, only the zero carry is stored
    1.48 +    mov    8(%ebp), %esi        # esi = a
    1.49 +    cld                         # clears the direction flag; no string instruction follows in this routine
    1.50 +1:
    1.51 +    movd   0(%esi), %mm0        # mm0 = *a
    1.52 +    add    $4, %esi             # a++
    1.53 +    pmuludq %mm1, %mm0          # mm0 = 64-bit product b * *a
    1.54 +    paddq  %mm0, %mm2           # mm2 = carry + product
    1.55 +    movd   %mm2, 0(%edi)        # *c = low 32 bits of the sum
    1.56 +    add    $4, %edi             # c++
    1.57 +    psrlq  $32, %mm2            # carry = high 32 bits of the sum
    1.58 +    dec    %ecx                 # --a_len
    1.59 +    jnz    1b                   # loop while a_len != 0
    1.60 +2:
    1.61 +    movd   %mm2, 0(%edi)        # *c = final carry
    1.62 +    emms                        # empty the MMX state before returning
    1.63 +    pop    %esi                 # restore in reverse push order
    1.64 +    pop    %edi
    1.65 +    leave  
    1.66 +    ret    
    1.67 +    nop
    1.68 + #  s_mpv_mul_d_add: for each of the a_len words of a, c[i] = low 32 bits of (a[i] * b + c[i] + carry); the final carry is stored (not added) in c[a_len]
    1.69 + #  ebp - 8:    caller's esi
    1.70 + #  ebp - 4:    caller's edi
    1.71 + #  ebp + 0:    caller's ebp
    1.72 + #  ebp + 4:    return address
    1.73 + #  ebp + 8:    a       argument
    1.74 + #  ebp + 12:   a_len   argument
    1.75 + #  ebp + 16:   b       argument
    1.76 + #  ebp + 20:   c       argument
    1.77 + #  registers:  (MMX: mm0 = product, then *c; mm1 = b; mm2 = running carry)
    1.78 + #      ebx:    (not used)
    1.79 + #      ecx:    a_len
    1.80 + #      esi:    a ptr
    1.81 + #      edi:    c ptr
    1.82 +.globl s_mpv_mul_d_add
    1.83 +.private_extern s_mpv_mul_d_add
    1.84 +TYPE_FUNCTION(s_mpv_mul_d_add)
    1.85 +s_mpv_mul_d_add:
    1.86 +    push   %ebp                 # build the ebp frame described above
    1.87 +    mov    %esp, %ebp
    1.88 +    push   %edi                 # save the registers this routine overwrites
    1.89 +    push   %esi
    1.90 +    psubq  %mm2, %mm2           # carry = 0 (mm2 - mm2)
    1.91 +    mov    12(%ebp), %ecx       # ecx = a_len
    1.92 +    movd   16(%ebp), %mm1       # mm1 = b
    1.93 +    mov    20(%ebp), %edi       # edi = c
    1.94 +    cmp    $0, %ecx
    1.95 +    je     2f                   # a_len == 0: skip the loop, only the zero carry is stored
    1.96 +    mov    8(%ebp), %esi        # esi = a
    1.97 +    cld                         # clears the direction flag; no string instruction follows in this routine
    1.98 +1:
    1.99 +    movd   0(%esi), %mm0        # mm0 = *a
   1.100 +    add    $4, %esi             # a++
   1.101 +    pmuludq %mm1, %mm0          # mm0 = 64-bit product b * *a
   1.102 +    paddq  %mm0, %mm2           # mm2 = carry + product
   1.103 +    movd   0(%edi), %mm0        # mm0 = *c (current word of the accumulator)
   1.104 +    paddq  %mm0, %mm2           # mm2 += *c
   1.105 +    movd   %mm2, 0(%edi)        # *c = low 32 bits of the sum
   1.106 +    add    $4, %edi             # c++
   1.107 +    psrlq  $32, %mm2            # carry = high 32 bits of the sum
   1.108 +    dec    %ecx                 # --a_len
   1.109 +    jnz    1b                   # loop while a_len != 0
   1.110 +2:
   1.111 +    movd   %mm2, 0(%edi)        # *c = final carry (overwrites, does not add)
   1.112 +    emms                        # empty the MMX state before returning
   1.113 +    pop    %esi                 # restore in reverse push order
   1.114 +    pop    %edi
   1.115 +    leave  
   1.116 +    ret    
   1.117 +    nop
   1.118 + #  s_mpv_mul_d_add_prop: same multiply-accumulate loop as s_mpv_mul_d_add, but the final carry is added into c[a_len] and rippled upward while it keeps carrying
   1.119 + #  ebp - 12:   caller's ebx
   1.120 + #  ebp - 8:    caller's esi
   1.121 + #  ebp - 4:    caller's edi
   1.122 + #  ebp + 0:    caller's ebp
   1.123 + #  ebp + 4:    return address
   1.124 + #  ebp + 8:    a       argument
   1.125 + #  ebp + 12:   a_len   argument
   1.126 + #  ebp + 16:   b       argument
   1.127 + #  ebp + 20:   c       argument
   1.128 + #  registers:  (MMX: mm0 = product, mm1 = b, mm2 = running carry, mm3 = *c)
   1.129 + #      eax:    scratch word during carry propagation
   1.130 + #      ebx:    carry
   1.131 + #      ecx:    a_len
   1.132 + #      esi:    a ptr
   1.133 + #      edi:    c ptr
   1.134 +.globl s_mpv_mul_d_add_prop
   1.135 +.private_extern s_mpv_mul_d_add_prop
   1.136 +TYPE_FUNCTION(s_mpv_mul_d_add_prop)
   1.137 +s_mpv_mul_d_add_prop:
   1.138 +    push   %ebp                 # build the ebp frame described above
   1.139 +    mov    %esp, %ebp
   1.140 +    push   %edi                 # save the registers this routine overwrites
   1.141 +    push   %esi
   1.142 +    push   %ebx
   1.143 +    psubq  %mm2, %mm2           # carry = 0 (mm2 - mm2)
   1.144 +    mov    12(%ebp), %ecx       # ecx = a_len
   1.145 +    movd   16(%ebp), %mm1       # mm1 = b
   1.146 +    mov    20(%ebp), %edi       # edi = c
   1.147 +    cmp    $0, %ecx
   1.148 +    je     2f                   # a_len == 0: carry stays zero, so the stosl path below is never reached
   1.149 +    mov    8(%ebp), %esi        # esi = a
   1.150 +    cld                         # direction flag = 0 so the stosl instructions below advance edi
   1.151 +1:
   1.152 +    movd   0(%esi), %mm0        # mm0 = *a
   1.153 +    movd   0(%edi), %mm3        # mm3 = *c (fetch the sum)
   1.154 +    add    $4, %esi             # a++
   1.155 +    pmuludq %mm1, %mm0          # mm0 = 64-bit product b * *a
   1.156 +    paddq  %mm0, %mm2           # mm2 = carry + product
   1.157 +    paddq  %mm3, %mm2           # mm2 += *c
   1.158 +    movd   %mm2, 0(%edi)        # *c = low 32 bits of the sum
   1.159 +    add    $4, %edi             # c++
   1.160 +    psrlq  $32, %mm2            # carry = high 32 bits of the sum
   1.161 +    dec    %ecx                 # --a_len
   1.162 +    jnz    1b                   # loop while a_len != 0
   1.163 +2:
   1.164 +    movd   %mm2, %ebx           # ebx = carry out of the multiply loop
   1.165 +    cmp    $0, %ebx             # is carry zero?
   1.166 +    jz     4f                   # nothing to propagate
   1.167 +    mov    0(%edi), %eax        # eax = *c
   1.168 +    add    %ebx, %eax           # eax += carry; CF = carry out of this word
   1.169 +    stosl                       # *c++ = eax; stosl does not modify flags, so CF survives for the jnc
   1.170 +    jnc    4f                   # done unless the add wrapped
   1.171 +3:                              # NOTE(review): no length bound on this ripple; presumably the caller guarantees c is long enough - confirm
   1.172 +    mov    0(%edi), %eax        # add in current word from *c
   1.173 +    adc    $0, %eax             # eax += CF (CF is set on every entry to this loop)
   1.174 +    stosl                       # [es:edi] = eax; edi += 4;
   1.175 +    jc     3b                   # keep rippling while the word wraps
   1.176 +4:
   1.177 +    emms                        # empty the MMX state before returning
   1.178 +    pop    %ebx                 # restore in reverse push order
   1.179 +    pop    %esi
   1.180 +    pop    %edi
   1.181 +    leave  
   1.182 +    ret    
   1.183 +    nop
   1.184 + #  s_mpv_sqr_add_prop: for each of the a_len words of pa, add the 64-bit square of that word into the next two words of ps, then ripple the final carry upward through ps
   1.185 + #  ebp - 12:   caller's ebx
   1.186 + #  ebp - 8:    caller's esi
   1.187 + #  ebp - 4:    caller's edi
   1.188 + #  ebp + 0:    caller's ebp
   1.189 + #  ebp + 4:    return address
   1.190 + #  ebp + 8:    pa      argument
   1.191 + #  ebp + 12:   a_len   argument
   1.192 + #  ebp + 16:   ps      argument
   1.193 + #  registers:  (MMX: mm0 = square, mm2 = running carry, mm3 = word loaded from ps)
   1.194 + #      eax:    scratch word during carry propagation
   1.195 + #      ebx:    carry
   1.196 + #      ecx:    a_len
   1.197 + #      esi:    pa ptr
   1.198 + #      edi:    ps ptr
   1.199 +.globl s_mpv_sqr_add_prop
   1.200 +.private_extern s_mpv_sqr_add_prop
   1.201 +TYPE_FUNCTION(s_mpv_sqr_add_prop)
   1.202 +s_mpv_sqr_add_prop:
   1.203 +    push   %ebp                 # build the ebp frame described above
   1.204 +    mov    %esp, %ebp
   1.205 +    push   %edi                 # save the registers this routine overwrites
   1.206 +    push   %esi
   1.207 +    push   %ebx
   1.208 +    psubq  %mm2, %mm2           # carry = 0 (mm2 - mm2)
   1.209 +    mov    12(%ebp), %ecx       # ecx = a_len
   1.210 +    mov    16(%ebp), %edi       # edi = ps
   1.211 +    cmp    $0, %ecx
   1.212 +    je     2f                   # a_len == 0: carry stays zero, so the stosl path below is never reached
   1.213 +    mov    8(%ebp), %esi        # esi = pa
   1.214 +    cld                         # direction flag = 0 so the stosl instructions below advance edi
   1.215 +1:
   1.216 +    movd   0(%esi), %mm0        # mm0 = *pa
   1.217 +    movd   0(%edi), %mm3        # mm3 = ps[0] (low word of the running sum)
   1.218 +    add    $4, %esi             # pa++
   1.219 +    pmuludq %mm0, %mm0          # mm0 = 64-bit square of *pa
   1.220 +    paddq  %mm0, %mm2           # mm2 = carry + square
   1.221 +    paddq  %mm3, %mm2           # add the low word
   1.222 +    movd   4(%edi), %mm3        # mm3 = ps[1] (high word of the running sum)
   1.223 +    movd   %mm2, 0(%edi)        # ps[0] = low 32 bits of the sum
   1.224 +    psrlq  $32, %mm2            # shift the remaining high part down
   1.225 +    paddq  %mm3, %mm2           # add the high word
   1.226 +    movd   %mm2, 4(%edi)        # ps[1] = low 32 bits of that sum
   1.227 +    psrlq  $32, %mm2            # save the carry.
   1.228 +    add    $8, %edi             # ps advances two words per input word
   1.229 +    dec    %ecx                 # --a_len
   1.230 +    jnz    1b                   # loop while a_len != 0
   1.231 +2:
   1.232 +    movd   %mm2, %ebx           # ebx = carry out of the squaring loop
   1.233 +    cmp    $0, %ebx             # is carry zero?
   1.234 +    jz     4f                   # nothing to propagate
   1.235 +    mov    0(%edi), %eax        # eax = *ps
   1.236 +    add    %ebx, %eax           # eax += carry; CF = carry out of this word
   1.237 +    stosl                       # *ps++ = eax; stosl does not modify flags, so CF survives for the jnc
   1.238 +    jnc    4f                   # done unless the add wrapped
   1.239 +3:                              # NOTE(review): no length bound on this ripple; presumably the caller guarantees ps is long enough - confirm
   1.240 +    mov    0(%edi), %eax        # add in current word from *ps
   1.241 +    adc    $0, %eax             # eax += CF (CF is set on every entry to this loop)
   1.242 +    stosl                       #  [es:edi] = eax; edi += 4;
   1.243 +    jc     3b                   # keep rippling while the word wraps
   1.244 +4:
   1.245 +    emms                        # empty the MMX state before returning
   1.246 +    pop    %ebx                 # restore in reverse push order
   1.247 +    pop    %esi
   1.248 +    pop    %edi
   1.249 +    leave  
   1.250 +    ret    
   1.251 +    nop
   1.252 +
   1.253 + #
   1.254 + # Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
   1.255 + # so its high bit is 1.   This code is from NSPR.
   1.256 + #
   1.257 + # mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
   1.258 + #                        mp_digit *qp, mp_digit *rp)
   1.259 +
   1.260 + #  esp +  0:   Caller's ebx
   1.261 + #  esp +  4:   return address
   1.262 + #  esp +  8:   Nhi     argument
   1.263 + #  esp + 12:   Nlo     argument
   1.264 + #  esp + 16:   divisor argument
   1.265 + #  esp + 20:   qp      argument
   1.266 + #  esp + 24:   rp      argument
   1.267 + #  registers:
   1.268 + #      eax:    Nlo going into div, quotient after; zeroed as the return value
   1.269 + #      ebx:    divisor, then qp, then rp
   1.270 + #      ecx:    (not used)
   1.271 + #      edx:    Nhi going into div, remainder after
   1.272 + #      esi:    (not used)
   1.273 + #      edi:    (not used)
   1.274 + #  NOTE: div faults if the quotient does not fit in 32 bits (Nhi >= divisor)
   1.275 +.globl s_mpv_div_2dx1d
   1.276 +.private_extern s_mpv_div_2dx1d
   1.277 +TYPE_FUNCTION(s_mpv_div_2dx1d)
   1.278 +s_mpv_div_2dx1d:
   1.279 +       push   %ebx              # save ebx; argument offsets above are relative to esp after this push
   1.280 +       mov    8(%esp), %edx     # edx = Nhi
   1.281 +       mov    12(%esp), %eax    # eax = Nlo
   1.282 +       mov    16(%esp), %ebx    # ebx = divisor
   1.283 +       div    %ebx              # edx:eax / ebx -> eax = quotient, edx = remainder
   1.284 +       mov    20(%esp), %ebx    # ebx = qp
   1.285 +       mov    %eax, 0(%ebx)     # *qp = quotient
   1.286 +       mov    24(%esp), %ebx    # ebx = rp
   1.287 +       mov    %edx, 0(%ebx)     # *rp = remainder
   1.288 +       xor    %eax, %eax        # return zero
   1.289 +       pop    %ebx              # restore the saved ebx
   1.290 +       ret    
   1.291 +       nop
   1.292 +
   1.293 +#ifndef DARWIN
   1.294 + # Magic indicating no need for an executable stack: an empty .note.GNU-stack section, emitted only for non-DARWIN builds
   1.295 +.section .note.GNU-stack, "", @progbits
   1.296 +.previous                       # switch back to the section that was active before the .section above
   1.297 +#endif

mercurial