security/nss/lib/freebl/mpi/mpi_amd64_masm.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 ; This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 ; License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 4
michael@0 5 ;
michael@0 6 ; This code is converted from mpi_amd64_gas.asm for MASM for x64.
michael@0 7 ;
michael@0 8
michael@0 9 ; ------------------------------------------------------------------------
michael@0 10 ;
michael@0 11 ; Implementation of s_mpv_mul_set_vec which exploits
michael@0 12 ; the 64X64->128 bit unsigned multiply instruction.
michael@0 13 ;
michael@0 14 ; ------------------------------------------------------------------------
michael@0 15
michael@0 16 ; r = a * digit, r and a are vectors of length len
michael@0 17 ; returns the carry digit
michael@0 18 ; r and a are 64 bit aligned.
michael@0 19 ;
michael@0 20 ; uint64_t
michael@0 21 ; s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
michael@0 22 ;
michael@0 23
michael@0 24 .CODE
michael@0 25
s_mpv_mul_set_vec64 PROC FRAME

; uint64_t s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len,
;                              uint64_t digit)
;
; Computes r[i] = low64(a[i] * digit + carry) for i in [0, len) and returns
; the final carry digit.  Returns 0 when len is 0.
;
; ABI:   Microsoft x64
; In:    rcx = r, rdx = a, r8d = len, r9 = digit
;        len is zero-extended from 32 bits, i.e. treated as unsigned.
; Out:   rax = carry digit
; Clobb: rcx, rdx, r8, r9, r11, flags (rdi/rsi are saved and restored)
;
; The body was converted from GAS code that expects its arguments in
; rdi/rsi/rdx/rcx, so the Microsoft argument registers are shuffled into
; those roles first:
;
;   rdi = r (destination cursor)      rsi = a (source cursor)
;   rcx = digit                       r8  = digits remaining
;   r9  = running carry               r11 = next a[] digit, loaded early
;   rdx:rax = 128-bit product of the current digit

        push    rdi                     ; rdi/rsi are non-volatile on Win64
        .pushreg rdi
        push    rsi
        .pushreg rsi
        .endprolog

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     rcx, r9                 ; rcx = digit
        mov     r8d, r8d                ; r8 = len, upper 32 bits cleared

        ; The carry must be cleared BEFORE the empty-vector check: the exit
        ; path returns r9, which still holds `digit` at this point.
        xor     r9d, r9d                ; carry = 0
        test    r8, r8
        jz      mulset_done             ; len == 0 -> return 0

; ---- main loop: eight digits per iteration ------------------------------
mulset_block8:
        cmp     r8, 8
        jb      mulset_tail             ; fewer than 8 left (and not 0)

        ; digit 0
        mov     rax, [rsi]
        mov     r11, [rsi+8]            ; fetch a[1] ahead of its use
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx

        ; digit 1
        mov     rax, r11
        mov     r11, [rsi+16]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+8], rax
        mov     r9, rdx

        ; digit 2
        mov     rax, r11
        mov     r11, [rsi+24]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+16], rax
        mov     r9, rdx

        ; digit 3
        mov     rax, r11
        mov     r11, [rsi+32]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+24], rax
        mov     r9, rdx

        ; digit 4
        mov     rax, r11
        mov     r11, [rsi+40]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+32], rax
        mov     r9, rdx

        ; digit 5
        mov     rax, r11
        mov     r11, [rsi+48]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+40], rax
        mov     r9, rdx

        ; digit 6
        mov     rax, r11
        mov     r11, [rsi+56]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+48], rax
        mov     r9, rdx

        ; digit 7 (nothing further to fetch in this block)
        mov     rax, r11
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+56], rax
        mov     r9, rdx

        add     rsi, 64                 ; advance both cursors by 8 digits
        add     rdi, 64
        sub     r8, 8
        jnz     mulset_block8
        jmp     mulset_done

; ---- tail: 1..7 digits remain -------------------------------------------
mulset_tail:
        mov     rax, [rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx
        dec     r8
        jz      mulset_done

        mov     rax, [rsi+8]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+8], rax
        mov     r9, rdx
        dec     r8
        jz      mulset_done

        mov     rax, [rsi+16]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+16], rax
        mov     r9, rdx
        dec     r8
        jz      mulset_done

        mov     rax, [rsi+24]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+24], rax
        mov     r9, rdx
        dec     r8
        jz      mulset_done

        mov     rax, [rsi+32]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+32], rax
        mov     r9, rdx
        dec     r8
        jz      mulset_done

        mov     rax, [rsi+40]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+40], rax
        mov     r9, rdx
        dec     r8
        jz      mulset_done

        ; Seventh tail digit: the tail is entered with at most 7 digits
        ; left, so this is always the last one and falls through.
        mov     rax, [rsi+48]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+48], rax
        mov     r9, rdx

mulset_done:
        mov     rax, r9                 ; return the carry digit
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_set_vec64 ENDP
michael@0 175
michael@0 176
michael@0 177 ;------------------------------------------------------------------------
michael@0 178 ;
michael@0 179 ; Implementation of s_mpv_mul_add_vec which exploits
michael@0 180 ; the 64X64->128 bit unsigned multiply instruction.
michael@0 181 ;
michael@0 182 ;------------------------------------------------------------------------
michael@0 183
michael@0 184 ; r += a * digit, r and a are vectors of length len
michael@0 185 ; returns the carry digit
michael@0 186 ; r and a are 64 bit aligned.
michael@0 187 ;
michael@0 188 ; uint64_t
michael@0 189 ; s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
michael@0 190 ;
michael@0 191
s_mpv_mul_add_vec64 PROC FRAME

; uint64_t s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len,
;                              uint64_t digit)
;
; Computes r[i] = low64(r[i] + a[i] * digit + carry) for i in [0, len) and
; returns the final carry digit.  Returns 0 when len is 0.
;
; ABI:   Microsoft x64
; In:    rcx = r, rdx = a, r8d = len, r9 = digit
;        len is zero-extended from 32 bits, i.e. treated as unsigned.
; Out:   rax = carry digit
; Clobb: rcx, rdx, r8, r9, r10, r11, flags (rdi/rsi are saved and restored)
;
; The body was converted from GAS code that expects its arguments in
; rdi/rsi/rdx/rcx, so the Microsoft argument registers are shuffled into
; those roles first:
;
;   rdi = r (accumulator cursor)      rsi = a (source cursor)
;   rcx = digit                       r8  = digits remaining
;   r9  = running carry               r10 = r[] digit being accumulated
;   r11 = next a[] digit, loaded early
;   rdx:rax = 128-bit product of the current digit

        push    rdi                     ; rdi/rsi are non-volatile on Win64
        .pushreg rdi
        push    rsi
        .pushreg rsi
        .endprolog

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     rcx, r9                 ; rcx = digit
        mov     r8d, r8d                ; r8 = len, upper 32 bits cleared

        ; The carry must be cleared BEFORE the empty-vector check: the exit
        ; path returns r9, which still holds `digit` at this point.
        xor     r9d, r9d                ; carry = 0
        test    r8, r8
        jz      muladd_done             ; len == 0 -> return 0

; ---- main loop: eight digits per iteration ------------------------------
muladd_block8:
        cmp     r8, 8
        jb      muladd_tail             ; fewer than 8 left (and not 0)

        ; digit 0
        mov     rax, [rsi]
        mov     r10, [rdi]
        mov     r11, [rsi+8]            ; fetch a[1] ahead of its use
        mul     rcx
        add     rax, r10                ; product + r[0]
        adc     rdx, 0
        mov     r10, [rdi+8]            ; fetch r[1] ahead of its use
        add     rax, r9                 ; + carry in
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx

        ; digit 1
        mov     rax, r11
        mov     r11, [rsi+16]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi+16]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+8], rax
        mov     r9, rdx

        ; digit 2
        mov     rax, r11
        mov     r11, [rsi+24]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi+24]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+16], rax
        mov     r9, rdx

        ; digit 3
        mov     rax, r11
        mov     r11, [rsi+32]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi+32]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+24], rax
        mov     r9, rdx

        ; digit 4
        mov     rax, r11
        mov     r11, [rsi+40]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi+40]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+32], rax
        mov     r9, rdx

        ; digit 5
        mov     rax, r11
        mov     r11, [rsi+48]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi+48]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+40], rax
        mov     r9, rdx

        ; digit 6
        mov     rax, r11
        mov     r11, [rsi+56]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [rdi+56]
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+48], rax
        mov     r9, rdx

        ; digit 7 (nothing further to fetch in this block)
        mov     rax, r11
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+56], rax
        mov     r9, rdx

        add     rsi, 64                 ; advance both cursors by 8 digits
        add     rdi, 64
        sub     r8, 8
        jnz     muladd_block8
        jmp     muladd_done

; ---- tail: 1..7 digits remain -------------------------------------------
muladd_tail:
        mov     rax, [rsi]
        mov     r10, [rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi], rax
        mov     r9, rdx
        dec     r8
        jz      muladd_done

        mov     rax, [rsi+8]
        mov     r10, [rdi+8]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+8], rax
        mov     r9, rdx
        dec     r8
        jz      muladd_done

        mov     rax, [rsi+16]
        mov     r10, [rdi+16]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+16], rax
        mov     r9, rdx
        dec     r8
        jz      muladd_done

        mov     rax, [rsi+24]
        mov     r10, [rdi+24]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+24], rax
        mov     r9, rdx
        dec     r8
        jz      muladd_done

        mov     rax, [rsi+32]
        mov     r10, [rdi+32]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+32], rax
        mov     r9, rdx
        dec     r8
        jz      muladd_done

        mov     rax, [rsi+40]
        mov     r10, [rdi+40]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+40], rax
        mov     r9, rdx
        dec     r8
        jz      muladd_done

        ; Seventh tail digit: the tail is entered with at most 7 digits
        ; left, so this is always the last one and falls through.
        mov     rax, [rsi+48]
        mov     r10, [rdi+48]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [rdi+48], rax
        mov     r9, rdx

muladd_done:
        mov     rax, r9                 ; return the carry digit
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_add_vec64 ENDP
michael@0 387
michael@0 388 END

mercurial