security/nss/lib/freebl/mpi/mpi_amd64_gas.s

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_set_vec which exploits
# the 64x64->128-bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

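# For reference, "mulq src" computes the full 128-bit product of %rax and
# src, leaving the low half in %rax and the high half in %rdx. In C terms
# (sketch only, not part of this file's build; assumes a compiler that
# provides the unsigned __int128 type):
#
#   unsigned __int128 p = (unsigned __int128)a[i] * digit;
#   uint64_t lo = (uint64_t)p;          /* ends up in %rax */
#   uint64_t hi = (uint64_t)(p >> 64);  /* ends up in %rdx */
#
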
# r = a * digit, where r and a are vectors of length len
# returns the carry digit
# r and a are 64-bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#

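# A minimal C sketch of what this routine computes (reference only, not
# part of the build; the _ref name is illustrative and the code assumes
# unsigned __int128 support):
#
#   #include <stdint.h>
#
#   uint64_t s_mpv_mul_set_vec64_ref(uint64_t *r, uint64_t *a, int len,
#                                    uint64_t digit)
#   {
#       uint64_t cy = 0;
#       for (int i = 0; i < len; i++) {
#           unsigned __int128 p = (unsigned __int128)a[i] * digit + cy;
#           r[i] = (uint64_t)p;          /* low 64 bits of the product  */
#           cy   = (uint64_t)(p >> 64);  /* high 64 bits become the carry */
#       }
#       return cy;
#   }
#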
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:

        xorq    %rax, %rax              # if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L17

        movq    %rdx, %r8               # Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                # cy = 0

.L15:
        cmpq    $8, %r8                 # if (len < 8) handle the tail
        jb      .L16
        movq    0(%rsi), %rax           # rax = a[0]
        movq    8(%rsi), %r11           # prefetch a[1]
        mulq    %rcx                    # p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          # prefetch a[2]
        mulq    %rcx                    # p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          # prefetch a[3]
        mulq    %rcx                    # p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          # prefetch a[4]
        mulq    %rcx                    # p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          # prefetch a[5]
        mulq    %rcx                    # p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          # prefetch a[6]
        mulq    %rcx                    # p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          # prefetch a[7]
        mulq    %rcx                    # p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    # p = a[7] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 56(%rdi)          # r[7] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L17
        jmp     .L15

.L16:
        movq    0(%rsi), %rax
        mulq    %rcx                    # p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    8(%rsi), %rax
        mulq    %rcx                    # p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    16(%rsi), %rax
        mulq    %rcx                    # p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    24(%rsi), %rax
        mulq    %rcx                    # p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    32(%rsi), %rax
        mulq    %rcx                    # p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    40(%rsi), %rax
        mulq    %rcx                    # p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    48(%rsi), %rax
        mulq    %rcx                    # p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17


.L17:
        movq    %r9, %rax
        ret

.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64

# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_add_vec which exploits
# the 64x64->128-bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

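# Compared with s_mpv_mul_set_vec64 above, each limb processed below also
# folds the existing r[i] into the product, so every step performs an extra
# load of r[i] and a second add/adc pair before the carry is saved.
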
# r += a * digit, where r and a are vectors of length len
# returns the carry digit
# r and a are 64-bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#

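# A minimal C sketch of what this routine computes (reference only, not
# part of the build; the _ref name is illustrative and the code assumes
# unsigned __int128 support):
#
#   #include <stdint.h>
#
#   uint64_t s_mpv_mul_add_vec64_ref(uint64_t *r, uint64_t *a, int len,
#                                    uint64_t digit)
#   {
#       uint64_t cy = 0;
#       for (int i = 0; i < len; i++) {
#           unsigned __int128 p =
#               (unsigned __int128)a[i] * digit + r[i] + cy;
#           r[i] = (uint64_t)p;          /* low 64 bits                 */
#           cy   = (uint64_t)(p >> 64);  /* high 64 bits become the carry */
#       }
#       return cy;
#   }
#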
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:

        xorq    %rax, %rax              # if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L27

        movq    %rdx, %r8               # Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                # cy = 0

.L25:
        cmpq    $8, %r8                 # if (len < 8) handle the tail
        jb      .L26
        movq    0(%rsi), %rax           # rax = a[0]
        movq    0(%rdi), %r10           # r10 = r[0]
        movq    8(%rsi), %r11           # prefetch a[1]
        mulq    %rcx                    # p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[0]
        movq    8(%rdi), %r10           # prefetch r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          # prefetch a[2]
        mulq    %rcx                    # p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[1]
        movq    16(%rdi), %r10          # prefetch r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          # prefetch a[3]
        mulq    %rcx                    # p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[2]
        movq    24(%rdi), %r10          # prefetch r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          # prefetch a[4]
        mulq    %rcx                    # p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[3]
        movq    32(%rdi), %r10          # prefetch r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          # prefetch a[5]
        mulq    %rcx                    # p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[4]
        movq    40(%rdi), %r10          # prefetch r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          # prefetch a[6]
        mulq    %rcx                    # p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[5]
        movq    48(%rdi), %r10          # prefetch r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          # prefetch a[7]
        mulq    %rcx                    # p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[6]
        movq    56(%rdi), %r10          # prefetch r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    # p = a[7] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 56(%rdi)          # r[7] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L27
        jmp     .L25

.L26:
        movq    0(%rsi), %rax
        movq    0(%rdi), %r10
        mulq    %rcx                    # p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[0]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    8(%rsi), %rax
        movq    8(%rdi), %r10
        mulq    %rcx                    # p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    16(%rsi), %rax
        movq    16(%rdi), %r10
        mulq    %rcx                    # p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    24(%rsi), %rax
        movq    24(%rdi), %r10
        mulq    %rcx                    # p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    32(%rsi), %rax
        movq    32(%rdi), %r10
        mulq    %rcx                    # p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    40(%rsi), %rax
        movq    40(%rdi), %r10
        mulq    %rcx                    # p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    48(%rsi), %rax
        movq    48(%rdi), %r10
        mulq    %rcx                    # p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27


.L27:
        movq    %r9, %rax
        ret

.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64

# Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous
