/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.


/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage (System V AMD64 calling convention):
/   %rdi = r (output vector)
/   %rsi = a (input vector)
/   %rdx = len on entry; copied to %r8 at once because mulq writes %rdx
/   %rcx = digit (the 64-bit multiplier)
/   %r9  = running carry between limbs
/   %r11 = prefetched next a[i] so the load overlaps the multiply
/
/ NOTE(review): len is declared int in the prototype above, but the code
/ tests the full 64-bit %rdx (testq/movq) -- this assumes the caller has
/ zero-extended the argument; TODO confirm against the C callers.

.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:

	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

/ Main loop, unrolled 8x: while len >= 8 limbs remain, each step forms
/ the 128-bit product p = a[i] * digit in %rdx:%rax, folds in the carry
/ (adcq $0,%rdx propagates any carry-out of the low add into hi(p)),
/ stores the low half and keeps the high half as the next carry.
.L15:
	cmpq	$8, %r8			/ 8 - len
	jb	.L16
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax		/ last limb of the group: no prefetch
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ advance a by 8 limbs
	addq	$64, %rdi		/ advance r by 8 limbs
	subq	$8, %r8			/ len -= 8

	jz	.L17
	jmp	.L15

/ Tail: at most 7 limbs remain (len was < 8 on entry here), handled one
/ limb per step; each step decrements len and exits when it hits zero,
/ so falling off the 7th step can only reach .L17 with len == 0.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17


.L17:
	movq	%r9, %rax		/ return the final carry digit
	ret

.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64

/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage (System V AMD64 calling convention):
/   %rdi = r (accumulator vector, read and written)
/   %rsi = a (input vector)
/   %rdx = len on entry; copied to %r8 at once because mulq writes %rdx
/   %rcx = digit (the 64-bit multiplier)
/   %r9  = running carry between limbs
/   %r10 = prefetched next r[i], %r11 = prefetched next a[i]
/
/ Per limb: p = a[i] * digit + r[i] + cy.  The two adcq $0,%rdx steps
/ fold each add's carry-out into hi(p); the sum cannot overflow 128 bits
/ because (2^64-1)^2 + 2*(2^64-1) < 2^128.
/
/ NOTE(review): as in s_mpv_mul_set_vec64, len is declared int but the
/ full 64-bit %rdx is tested -- assumes a zero-extended argument; TODO
/ confirm against the C callers.

.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:

	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

/ Main loop, unrolled 8x: processes 8 limbs per pass while len >= 8,
/ prefetching the next a[] and r[] limbs so loads overlap the multiply.
.L25:
	cmpq	$8, %r8			/ 8 - len
	jb	.L26
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax		/ last limb of the group: no prefetch
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ advance a by 8 limbs
	addq	$64, %rdi		/ advance r by 8 limbs
	subq	$8, %r8			/ len -= 8

	jz	.L27
	jmp	.L25

/ Tail: at most 7 limbs remain (len was < 8 on entry here), handled one
/ limb per step; each step decrements len and exits when it hits zero.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27


.L27:
	movq	%r9, %rax		/ return the final carry digit
	ret

.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64