security/nss/lib/freebl/mpi/mpi_amd64_gas.s

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ------------------------------------------------------------------------
#
#  Implementation of s_mpv_mul_set_vec which exploits
#  the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
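# For reference, the operation above can be sketched in portable C.
# This is an illustration only (not part of the build); it assumes the
# GCC/Clang unsigned __int128 extension, and the _ref name is hypothetical.
#
#   uint64_t
#   s_mpv_mul_set_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#   {
#       uint64_t cy = 0;                               /* running carry     */
#       for (int i = 0; i < len; i++) {
#           unsigned __int128 p =
#               (unsigned __int128)a[i] * digit + cy;  /* 64x64 -> 128 bit  */
#           r[i] = (uint64_t)p;                        /* r[i] = lo(p)      */
#           cy = (uint64_t)(p >> 64);                  /* cy   = hi(p)      */
#       }
#       return cy;                                     /* final carry digit */
#   }
#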

.text
.align 16
.globl s_mpv_mul_set_vec64
.type s_mpv_mul_set_vec64, @function
s_mpv_mul_set_vec64:

	xorq	%rax, %rax		# if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		# Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		# cy = 0

.L15:					# main loop: 8 limbs per pass
	cmpq	$8, %r8			# if (len < 8) process the tail
	jb	.L16
	movq	0(%rsi), %rax		# rax = a[0]
	movq	8(%rsi), %r11		# prefetch a[1]
	mulq	%rcx			# p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		# prefetch a[2]
	mulq	%rcx			# p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		# prefetch a[3]
	mulq	%rcx			# p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		# prefetch a[4]
	mulq	%rcx			# p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		# prefetch a[5]
	mulq	%rcx			# p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		# prefetch a[6]
	mulq	%rcx			# p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		# prefetch a[7]
	mulq	%rcx			# p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			# p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 56(%rdi)		# r[7] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L17
	jmp	.L15

.L16:					# tail: 1..7 limbs remain
	movq	0(%rsi), %rax
	mulq	%rcx			# p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			# p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			# p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			# p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			# p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			# p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			# p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

.L17:
	movq	%r9, %rax
	ret

.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64

# ------------------------------------------------------------------------
#
#  Implementation of s_mpv_mul_add_vec which exploits
#  the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
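# For reference, an equivalent portable C sketch (illustration only, not
# part of the build; assumes the GCC/Clang unsigned __int128 extension,
# and the _ref name is hypothetical):
#
#   uint64_t
#   s_mpv_mul_add_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#   {
#       uint64_t cy = 0;                                  /* running carry */
#       for (int i = 0; i < len; i++) {
#           unsigned __int128 p =
#               (unsigned __int128)a[i] * digit + r[i] + cy;
#           r[i] = (uint64_t)p;                           /* r[i] = lo(p)  */
#           cy = (uint64_t)(p >> 64);                     /* cy   = hi(p)  */
#       }
#       return cy;                                        /* carry digit   */
#   }
#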

.text
.align 16
.globl s_mpv_mul_add_vec64
.type s_mpv_mul_add_vec64, @function
s_mpv_mul_add_vec64:

	xorq	%rax, %rax		# if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		# Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		# cy = 0

.L25:					# main loop: 8 limbs per pass
	cmpq	$8, %r8			# if (len < 8) process the tail
	jb	.L26
	movq	0(%rsi), %rax		# rax = a[0]
	movq	0(%rdi), %r10		# r10 = r[0]
	movq	8(%rsi), %r11		# prefetch a[1]
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	movq	8(%rdi), %r10		# prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		# prefetch a[2]
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	movq	16(%rdi), %r10		# prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		# prefetch a[3]
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	movq	24(%rdi), %r10		# prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		# prefetch a[4]
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	movq	32(%rdi), %r10		# prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		# prefetch a[5]
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	movq	40(%rdi), %r10		# prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		# prefetch a[6]
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	movq	48(%rdi), %r10		# prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		# prefetch a[7]
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	movq	56(%rdi), %r10		# prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			# p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 56(%rdi)		# r[7] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L27
	jmp	.L25

.L26:					# tail: 1..7 limbs remain
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

.L27:
	movq	%r9, %rax
	ret

.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
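
# The two routines above are the building blocks of a schoolbook
# multi-precision multiply: the first writes the initial partial product,
# the second accumulates the remaining rows. A C sketch of that composition
# (illustration only; mul_vec_sketch is a hypothetical name, and the way the
# surrounding MPI code actually wires these up may differ):
#
#   /* r must provide room for alen + blen limbs */
#   void
#   mul_vec_sketch(uint64_t *r, uint64_t *a, int alen, uint64_t *b, int blen)
#   {
#       r[alen] = s_mpv_mul_set_vec64(r, a, alen, b[0]);
#       for (int i = 1; i < blen; i++)
#           r[alen + i] = s_mpv_mul_add_vec64(r + i, a, alen, b[i]);
#   }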

# Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous
