security/nss/lib/freebl/mpi/mpi_amd64_gas.s

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/mpi/mpi_amd64_gas.s	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,389 @@
     1.4 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.5 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.7 +
     1.8 +
     1.9 +# ------------------------------------------------------------------------
    1.10 +#
    1.11 +#  Implementation of s_mpv_mul_set_vec64, which exploits
    1.12 +#  the 64x64->128 bit unsigned multiply instruction.
    1.13 +#
    1.14 +# ------------------------------------------------------------------------
    1.15 +
    1.16 +# r = a * digit, r and a are vectors of length len
    1.17 +# returns the carry digit (0 when len == 0)
    1.18 +# r and a are 64 bit aligned.
    1.19 +#
    1.20 +# uint64_t
    1.21 +# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
    1.22 +#
    1.23 +# In: %rdi = r, %rsi = a, %edx = len, %rcx = digit.  Out: %rax = carry.
    1.24 +.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
    1.25 +# Scratch: %rdx, %rsi, %rdi, %r8 (digits left), %r9 (cy), %r11, flags.
    1.26 +	xorq	%r9, %r9		# cy = 0; this is also the len == 0 result
    1.27 +	movl	%edx, %r8d		# r8 = len, zero-extended: only the low 32
    1.28 +					# bits of an int argument are defined
    1.29 +	testq	%r8, %r8		# if (len == 0) return (0)
    1.30 +	jz	.L17
    1.31 +# %rdx is free from here on; every mulq overwrites it with hi(p).
    1.32 +
    1.33 +.L15:
    1.34 +	cmpq	$8, %r8			# fewer than 8 digits left?
    1.35 +	jb	.L16			# yes: finish one digit at a time
    1.36 +	movq	0(%rsi), %rax		# rax = a[0]
    1.37 +	movq	8(%rsi), %r11		# prefetch a[1]
    1.38 +	mulq	%rcx			# p = a[0] * digit
    1.39 +	addq	%r9, %rax
    1.40 +	adcq	$0, %rdx		# p += cy (cannot overflow 128 bits)
    1.41 +	movq	%rax, 0(%rdi)		# r[0] = lo(p)
    1.42 +	movq	%rdx, %r9		# cy = hi(p)
    1.43 +
    1.44 +	movq	%r11, %rax
    1.45 +	movq	16(%rsi), %r11		# prefetch a[2]
    1.46 +	mulq	%rcx			# p = a[1] * digit
    1.47 +	addq	%r9, %rax
    1.48 +	adcq	$0, %rdx		# p += cy
    1.49 +	movq	%rax, 8(%rdi)		# r[1] = lo(p)
    1.50 +	movq	%rdx, %r9		# cy = hi(p)
    1.51 +
    1.52 +	movq	%r11, %rax
    1.53 +	movq	24(%rsi), %r11		# prefetch a[3]
    1.54 +	mulq	%rcx			# p = a[2] * digit
    1.55 +	addq	%r9, %rax
    1.56 +	adcq	$0, %rdx		# p += cy
    1.57 +	movq	%rax, 16(%rdi)		# r[2] = lo(p)
    1.58 +	movq	%rdx, %r9		# cy = hi(p)
    1.59 +
    1.60 +	movq	%r11, %rax
    1.61 +	movq	32(%rsi), %r11		# prefetch a[4]
    1.62 +	mulq	%rcx			# p = a[3] * digit
    1.63 +	addq	%r9, %rax
    1.64 +	adcq	$0, %rdx		# p += cy
    1.65 +	movq	%rax, 24(%rdi)		# r[3] = lo(p)
    1.66 +	movq	%rdx, %r9		# cy = hi(p)
    1.67 +
    1.68 +	movq	%r11, %rax
    1.69 +	movq	40(%rsi), %r11		# prefetch a[5]
    1.70 +	mulq	%rcx			# p = a[4] * digit
    1.71 +	addq	%r9, %rax
    1.72 +	adcq	$0, %rdx		# p += cy
    1.73 +	movq	%rax, 32(%rdi)		# r[4] = lo(p)
    1.74 +	movq	%rdx, %r9		# cy = hi(p)
    1.75 +
    1.76 +	movq	%r11, %rax
    1.77 +	movq	48(%rsi), %r11		# prefetch a[6]
    1.78 +	mulq	%rcx			# p = a[5] * digit
    1.79 +	addq	%r9, %rax
    1.80 +	adcq	$0, %rdx		# p += cy
    1.81 +	movq	%rax, 40(%rdi)		# r[5] = lo(p)
    1.82 +	movq	%rdx, %r9		# cy = hi(p)
    1.83 +
    1.84 +	movq	%r11, %rax
    1.85 +	movq	56(%rsi), %r11		# prefetch a[7]
    1.86 +	mulq	%rcx			# p = a[6] * digit
    1.87 +	addq	%r9, %rax
    1.88 +	adcq	$0, %rdx		# p += cy
    1.89 +	movq	%rax, 48(%rdi)		# r[6] = lo(p)
    1.90 +	movq	%rdx, %r9		# cy = hi(p)
    1.91 +
    1.92 +	movq	%r11, %rax
    1.93 +	mulq	%rcx			# p = a[7] * digit
    1.94 +	addq	%r9, %rax
    1.95 +	adcq	$0, %rdx		# p += cy
    1.96 +	movq	%rax, 56(%rdi)		# r[7] = lo(p)
    1.97 +	movq	%rdx, %r9		# cy = hi(p)
    1.98 +
    1.99 +	addq	$64, %rsi		# a += 8
   1.100 +	addq	$64, %rdi		# r += 8
   1.101 +	subq	$8, %r8			# len -= 8
   1.102 +
   1.103 +	jz	.L17			# nothing left: return cy
   1.104 +	jmp	.L15
   1.105 +
   1.106 +.L16:					# 1 <= digits left <= 7 here
   1.107 +	movq	0(%rsi), %rax
   1.108 +	mulq	%rcx			# p = a[0] * digit
   1.109 +	addq	%r9, %rax
   1.110 +	adcq	$0, %rdx		# p += cy
   1.111 +	movq	%rax, 0(%rdi)		# r[0] = lo(p)
   1.112 +	movq	%rdx, %r9		# cy = hi(p)
   1.113 +	decq	%r8
   1.114 +	jz	.L17
   1.115 +
   1.116 +	movq	8(%rsi), %rax
   1.117 +	mulq	%rcx			# p = a[1] * digit
   1.118 +	addq	%r9, %rax
   1.119 +	adcq	$0, %rdx		# p += cy
   1.120 +	movq	%rax, 8(%rdi)		# r[1] = lo(p)
   1.121 +	movq	%rdx, %r9		# cy = hi(p)
   1.122 +	decq	%r8
   1.123 +	jz	.L17
   1.124 +
   1.125 +	movq	16(%rsi), %rax
   1.126 +	mulq	%rcx			# p = a[2] * digit
   1.127 +	addq	%r9, %rax
   1.128 +	adcq	$0, %rdx		# p += cy
   1.129 +	movq	%rax, 16(%rdi)		# r[2] = lo(p)
   1.130 +	movq	%rdx, %r9		# cy = hi(p)
   1.131 +	decq	%r8
   1.132 +	jz	.L17
   1.133 +
   1.134 +	movq	24(%rsi), %rax
   1.135 +	mulq	%rcx			# p = a[3] * digit
   1.136 +	addq	%r9, %rax
   1.137 +	adcq	$0, %rdx		# p += cy
   1.138 +	movq	%rax, 24(%rdi)		# r[3] = lo(p)
   1.139 +	movq	%rdx, %r9		# cy = hi(p)
   1.140 +	decq	%r8
   1.141 +	jz	.L17
   1.142 +
   1.143 +	movq	32(%rsi), %rax
   1.144 +	mulq	%rcx			# p = a[4] * digit
   1.145 +	addq	%r9, %rax
   1.146 +	adcq	$0, %rdx		# p += cy
   1.147 +	movq	%rax, 32(%rdi)		# r[4] = lo(p)
   1.148 +	movq	%rdx, %r9		# cy = hi(p)
   1.149 +	decq	%r8
   1.150 +	jz	.L17
   1.151 +
   1.152 +	movq	40(%rsi), %rax
   1.153 +	mulq	%rcx			# p = a[5] * digit
   1.154 +	addq	%r9, %rax
   1.155 +	adcq	$0, %rdx		# p += cy
   1.156 +	movq	%rax, 40(%rdi)		# r[5] = lo(p)
   1.157 +	movq	%rdx, %r9		# cy = hi(p)
   1.158 +	decq	%r8
   1.159 +	jz	.L17
   1.160 +
   1.161 +	movq	48(%rsi), %rax
   1.162 +	mulq	%rcx			# p = a[6] * digit
   1.163 +	addq	%r9, %rax
   1.164 +	adcq	$0, %rdx		# p += cy
   1.165 +	movq	%rax, 48(%rdi)		# r[6] = lo(p)
   1.166 +	movq	%rdx, %r9		# cy = hi(p)
   1.167 +	decq	%r8
   1.168 +	jz	.L17
   1.169 +# At most 7 digits are handled above, so execution always reaches .L17.
   1.170 +
   1.171 +.L17:
   1.172 +	movq	%r9, %rax		# return cy
   1.173 +	ret
   1.174 +
   1.175 +.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
   1.176 +
   1.177 +# ------------------------------------------------------------------------
   1.178 +#
   1.179 +#  Implementation of s_mpv_mul_add_vec64, which exploits
   1.180 +#  the 64x64->128 bit unsigned multiply instruction.
   1.181 +#
   1.182 +# ------------------------------------------------------------------------
   1.183 +
   1.184 +# r += a * digit, r and a are vectors of length len
   1.185 +# returns the carry digit (0 when len == 0)
   1.186 +# r and a are 64 bit aligned.
   1.187 +#
   1.188 +# uint64_t
   1.189 +# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
   1.190 +#
   1.191 +# In: %rdi = r, %rsi = a, %edx = len, %rcx = digit.  Out: %rax = carry.
   1.192 +.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
   1.193 +# Scratch: %rdx, %rsi, %rdi, %r8 (digits left), %r9 (cy), %r10, %r11, flags.
   1.194 +	xorq	%r9, %r9		# cy = 0; this is also the len == 0 result
   1.195 +	movl	%edx, %r8d		# r8 = len, zero-extended: only the low 32
   1.196 +					# bits of an int argument are defined
   1.197 +	testq	%r8, %r8		# if (len == 0) return (0)
   1.198 +	jz	.L27
   1.199 +# %rdx is free from here on; every mulq overwrites it with hi(p).
   1.200 +
   1.201 +.L25:
   1.202 +	cmpq	$8, %r8			# fewer than 8 digits left?
   1.203 +	jb	.L26			# yes: finish one digit at a time
   1.204 +	movq	0(%rsi), %rax		# rax = a[0]
   1.205 +	movq	0(%rdi), %r10		# r10 = r[0]
   1.206 +	movq	8(%rsi), %r11		# prefetch a[1]
   1.207 +	mulq	%rcx			# p = a[0] * digit
   1.208 +	addq	%r10, %rax
   1.209 +	adcq	$0, %rdx		# p += r[0]
   1.210 +	movq	8(%rdi), %r10		# prefetch r[1]
   1.211 +	addq	%r9, %rax
   1.212 +	adcq	$0, %rdx		# p += cy (sum still fits in 128 bits)
   1.213 +	movq	%rax, 0(%rdi)		# r[0] = lo(p)
   1.214 +	movq	%rdx, %r9		# cy = hi(p)
   1.215 +
   1.216 +	movq	%r11, %rax
   1.217 +	movq	16(%rsi), %r11		# prefetch a[2]
   1.218 +	mulq	%rcx			# p = a[1] * digit
   1.219 +	addq	%r10, %rax
   1.220 +	adcq	$0, %rdx		# p += r[1]
   1.221 +	movq	16(%rdi), %r10		# prefetch r[2]
   1.222 +	addq	%r9, %rax
   1.223 +	adcq	$0, %rdx		# p += cy
   1.224 +	movq	%rax, 8(%rdi)		# r[1] = lo(p)
   1.225 +	movq	%rdx, %r9		# cy = hi(p)
   1.226 +
   1.227 +	movq	%r11, %rax
   1.228 +	movq	24(%rsi), %r11		# prefetch a[3]
   1.229 +	mulq	%rcx			# p = a[2] * digit
   1.230 +	addq	%r10, %rax
   1.231 +	adcq	$0, %rdx		# p += r[2]
   1.232 +	movq	24(%rdi), %r10		# prefetch r[3]
   1.233 +	addq	%r9, %rax
   1.234 +	adcq	$0, %rdx		# p += cy
   1.235 +	movq	%rax, 16(%rdi)		# r[2] = lo(p)
   1.236 +	movq	%rdx, %r9		# cy = hi(p)
   1.237 +
   1.238 +	movq	%r11, %rax
   1.239 +	movq	32(%rsi), %r11		# prefetch a[4]
   1.240 +	mulq	%rcx			# p = a[3] * digit
   1.241 +	addq	%r10, %rax
   1.242 +	adcq	$0, %rdx		# p += r[3]
   1.243 +	movq	32(%rdi), %r10		# prefetch r[4]
   1.244 +	addq	%r9, %rax
   1.245 +	adcq	$0, %rdx		# p += cy
   1.246 +	movq	%rax, 24(%rdi)		# r[3] = lo(p)
   1.247 +	movq	%rdx, %r9		# cy = hi(p)
   1.248 +
   1.249 +	movq	%r11, %rax
   1.250 +	movq	40(%rsi), %r11		# prefetch a[5]
   1.251 +	mulq	%rcx			# p = a[4] * digit
   1.252 +	addq	%r10, %rax
   1.253 +	adcq	$0, %rdx		# p += r[4]
   1.254 +	movq	40(%rdi), %r10		# prefetch r[5]
   1.255 +	addq	%r9, %rax
   1.256 +	adcq	$0, %rdx		# p += cy
   1.257 +	movq	%rax, 32(%rdi)		# r[4] = lo(p)
   1.258 +	movq	%rdx, %r9		# cy = hi(p)
   1.259 +
   1.260 +	movq	%r11, %rax
   1.261 +	movq	48(%rsi), %r11		# prefetch a[6]
   1.262 +	mulq	%rcx			# p = a[5] * digit
   1.263 +	addq	%r10, %rax
   1.264 +	adcq	$0, %rdx		# p += r[5]
   1.265 +	movq	48(%rdi), %r10		# prefetch r[6]
   1.266 +	addq	%r9, %rax
   1.267 +	adcq	$0, %rdx		# p += cy
   1.268 +	movq	%rax, 40(%rdi)		# r[5] = lo(p)
   1.269 +	movq	%rdx, %r9		# cy = hi(p)
   1.270 +
   1.271 +	movq	%r11, %rax
   1.272 +	movq	56(%rsi), %r11		# prefetch a[7]
   1.273 +	mulq	%rcx			# p = a[6] * digit
   1.274 +	addq	%r10, %rax
   1.275 +	adcq	$0, %rdx		# p += r[6]
   1.276 +	movq	56(%rdi), %r10		# prefetch r[7]
   1.277 +	addq	%r9, %rax
   1.278 +	adcq	$0, %rdx		# p += cy
   1.279 +	movq	%rax, 48(%rdi)		# r[6] = lo(p)
   1.280 +	movq	%rdx, %r9		# cy = hi(p)
   1.281 +
   1.282 +	movq	%r11, %rax
   1.283 +	mulq	%rcx			# p = a[7] * digit
   1.284 +	addq	%r10, %rax
   1.285 +	adcq	$0, %rdx		# p += r[7]
   1.286 +	addq	%r9, %rax
   1.287 +	adcq	$0, %rdx		# p += cy
   1.288 +	movq	%rax, 56(%rdi)		# r[7] = lo(p)
   1.289 +	movq	%rdx, %r9		# cy = hi(p)
   1.290 +
   1.291 +	addq	$64, %rsi		# a += 8
   1.292 +	addq	$64, %rdi		# r += 8
   1.293 +	subq	$8, %r8			# len -= 8
   1.294 +
   1.295 +	jz	.L27			# nothing left: return cy
   1.296 +	jmp	.L25
   1.297 +
   1.298 +.L26:					# 1 <= digits left <= 7 here
   1.299 +	movq	0(%rsi), %rax
   1.300 +	movq	0(%rdi), %r10
   1.301 +	mulq	%rcx			# p = a[0] * digit
   1.302 +	addq	%r10, %rax
   1.303 +	adcq	$0, %rdx		# p += r[0]
   1.304 +	addq	%r9, %rax
   1.305 +	adcq	$0, %rdx		# p += cy
   1.306 +	movq	%rax, 0(%rdi)		# r[0] = lo(p)
   1.307 +	movq	%rdx, %r9		# cy = hi(p)
   1.308 +	decq	%r8
   1.309 +	jz	.L27
   1.310 +
   1.311 +	movq	8(%rsi), %rax
   1.312 +	movq	8(%rdi), %r10
   1.313 +	mulq	%rcx			# p = a[1] * digit
   1.314 +	addq	%r10, %rax
   1.315 +	adcq	$0, %rdx		# p += r[1]
   1.316 +	addq	%r9, %rax
   1.317 +	adcq	$0, %rdx		# p += cy
   1.318 +	movq	%rax, 8(%rdi)		# r[1] = lo(p)
   1.319 +	movq	%rdx, %r9		# cy = hi(p)
   1.320 +	decq	%r8
   1.321 +	jz	.L27
   1.322 +
   1.323 +	movq	16(%rsi), %rax
   1.324 +	movq	16(%rdi), %r10
   1.325 +	mulq	%rcx			# p = a[2] * digit
   1.326 +	addq	%r10, %rax
   1.327 +	adcq	$0, %rdx		# p += r[2]
   1.328 +	addq	%r9, %rax
   1.329 +	adcq	$0, %rdx		# p += cy
   1.330 +	movq	%rax, 16(%rdi)		# r[2] = lo(p)
   1.331 +	movq	%rdx, %r9		# cy = hi(p)
   1.332 +	decq	%r8
   1.333 +	jz	.L27
   1.334 +
   1.335 +	movq	24(%rsi), %rax
   1.336 +	movq	24(%rdi), %r10
   1.337 +	mulq	%rcx			# p = a[3] * digit
   1.338 +	addq	%r10, %rax
   1.339 +	adcq	$0, %rdx		# p += r[3]
   1.340 +	addq	%r9, %rax
   1.341 +	adcq	$0, %rdx		# p += cy
   1.342 +	movq	%rax, 24(%rdi)		# r[3] = lo(p)
   1.343 +	movq	%rdx, %r9		# cy = hi(p)
   1.344 +	decq	%r8
   1.345 +	jz	.L27
   1.346 +
   1.347 +	movq	32(%rsi), %rax
   1.348 +	movq	32(%rdi), %r10
   1.349 +	mulq	%rcx			# p = a[4] * digit
   1.350 +	addq	%r10, %rax
   1.351 +	adcq	$0, %rdx		# p += r[4]
   1.352 +	addq	%r9, %rax
   1.353 +	adcq	$0, %rdx		# p += cy
   1.354 +	movq	%rax, 32(%rdi)		# r[4] = lo(p)
   1.355 +	movq	%rdx, %r9		# cy = hi(p)
   1.356 +	decq	%r8
   1.357 +	jz	.L27
   1.358 +
   1.359 +	movq	40(%rsi), %rax
   1.360 +	movq	40(%rdi), %r10
   1.361 +	mulq	%rcx			# p = a[5] * digit
   1.362 +	addq	%r10, %rax
   1.363 +	adcq	$0, %rdx		# p += r[5]
   1.364 +	addq	%r9, %rax
   1.365 +	adcq	$0, %rdx		# p += cy
   1.366 +	movq	%rax, 40(%rdi)		# r[5] = lo(p)
   1.367 +	movq	%rdx, %r9		# cy = hi(p)
   1.368 +	decq	%r8
   1.369 +	jz	.L27
   1.370 +
   1.371 +	movq	48(%rsi), %rax
   1.372 +	movq	48(%rdi), %r10
   1.373 +	mulq	%rcx			# p = a[6] * digit
   1.374 +	addq	%r10, %rax
   1.375 +	adcq	$0, %rdx		# p += r[6]
   1.376 +	addq	%r9, %rax
   1.377 +	adcq	$0, %rdx		# p += cy
   1.378 +	movq	%rax, 48(%rdi)		# r[6] = lo(p)
   1.379 +	movq	%rdx, %r9		# cy = hi(p)
   1.380 +	decq	%r8
   1.381 +	jz	.L27
   1.382 +# At most 7 digits are handled above, so execution always reaches .L27.
   1.383 +
   1.384 +.L27:
   1.385 +	movq	%r9, %rax		# return cy
   1.386 +	ret
   1.387 +
   1.388 +.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
   1.389 +
   1.390 +# Empty .note.GNU-stack section: marks this object as not needing an executable stack
   1.391 +.section .note.GNU-stack, "", @progbits
   1.392 +.previous				# switch back to the section that was active before

mercurial