; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

;
; This code is converted from mpi_amd64_gas.asm for MASM for x64.
;

; ------------------------------------------------------------------------
;
;  Implementation of s_mpv_mul_set_vec which exploits
;  the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

; r = a * digit, r and a are vectors of length len
; returns the carry digit (0 when len is 0)
; r and a are 64 bit aligned.
;
; uint64_t
; s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
; Register usage inside the body:
;   rdi = r, rsi = a, r8 = digits remaining, rcx = digit,
;   r9  = running carry, r11 = next a digit (preloaded in the 8-way loop),
;   rdx:rax = 128-bit product of the current multiply.
;

.CODE

s_mpv_mul_set_vec64 PROC FRAME

        ; rdi and rsi are nonvolatile in the Microsoft x64 calling
        ; convention, so they are saved here and the saves are described
        ; to the unwinder with .pushreg.
        push    rdi
        .pushreg rdi
        push    rsi
        .pushreg rsi
        .endprolog

        ; Compatibility shim for parameter registers.
        ;
        ; The arguments arrive in rcx, rdx, r8 and r9; the body converted
        ; from the GAS source expects them in rdi, rsi, rdx and rcx.
        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len (32-bit move zero-extends)
        mov     rcx, r9                 ; rcx = digit

        ; Clear the carry before the length test: the exit path returns r9,
        ; which would otherwise still hold the incoming digit when len == 0.
        xor     r9, r9
        test    rdx, rdx
        jz      L17
        mov     r8, rdx                 ; r8 = digits remaining

L15:
        ; Main loop: eight digits per iteration while at least 8 remain.
        cmp     r8, 8
        jb      L16
        mov     rax, [rsi]
        mov     r11, [8+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64
        add     rdi, 64
        sub     r8, 8
        jz      L17
        jmp     L15

L16:
        ; Tail: between 1 and 7 digits remain; handle them one at a time
        ; and leave as soon as the count reaches zero.
        mov     rax, [0+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [8+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        ; Seventh and last possible tail digit: fall through to the exit.
        mov     rax, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx

L17:
        mov     rax, r9                 ; return the carry digit
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_set_vec64 ENDP


;------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_add_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
;------------------------------------------------------------------------

; r += a * digit, r and a are vectors of length len
; returns the carry digit (0 when len is 0)
; r and a are 64 bit aligned.
;
; uint64_t
; s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
; Register usage inside the body:
;   rdi = r, rsi = a, r8 = digits remaining, rcx = digit,
;   r9  = running carry, r10 = current r digit, r11 = next a digit
;   (preloaded in the 8-way loop), rdx:rax = 128-bit product.
;

s_mpv_mul_add_vec64 PROC FRAME

        ; rdi and rsi are nonvolatile in the Microsoft x64 calling
        ; convention, so they are saved here and the saves are described
        ; to the unwinder with .pushreg.
        push    rdi
        .pushreg rdi
        push    rsi
        .pushreg rsi
        .endprolog

        ; Compatibility shim for parameter registers.
        ;
        ; The arguments arrive in rcx, rdx, r8 and r9; the body converted
        ; from the GAS source expects them in rdi, rsi, rdx and rcx.
        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len (32-bit move zero-extends)
        mov     rcx, r9                 ; rcx = digit

        ; Clear the carry before the length test: the exit path returns r9,
        ; which would otherwise still hold the incoming digit when len == 0.
        xor     r9, r9
        test    rdx, rdx
        jz      L27
        mov     r8, rdx                 ; r8 = digits remaining

L25:
        ; Main loop: eight digits per iteration while at least 8 remain.
        ; Each step adds the old r digit and the carry to the low product
        ; word, propagating both overflows into the high word.
        cmp     r8, 8
        jb      L26
        mov     rax, [0+rsi]
        mov     r10, [0+rdi]
        mov     r11, [8+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [8+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [16+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [24+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [32+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [40+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [48+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [56+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64
        add     rdi, 64
        sub     r8, 8
        jz      L27
        jmp     L25

L26:
        ; Tail: between 1 and 7 digits remain; handle them one at a time
        ; and leave as soon as the count reaches zero.
        mov     rax, [0+rsi]
        mov     r10, [0+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [8+rsi]
        mov     r10, [8+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [16+rsi]
        mov     r10, [16+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [24+rsi]
        mov     r10, [24+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [32+rsi]
        mov     r10, [32+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [40+rsi]
        mov     r10, [40+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        ; Seventh and last possible tail digit: fall through to the exit.
        mov     rax, [48+rsi]
        mov     r10, [48+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx

L27:
        mov     rax, r9                 ; return the carry digit

        pop     rsi
        pop     rdi
        ret

s_mpv_mul_add_vec64 ENDP

END