/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.


/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_set_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
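/ For reference, a rough C-level sketch of what this routine computes.
/ The sketch is not part of the original source; it assumes the compiler
/ provides the unsigned __int128 extension to express the 64x64->128 bit
/ product that mulq performs:
/
/   uint64_t
/   s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/   {
/       uint64_t cy = 0;
/       for (int i = 0; i < len; i++) {
/           unsigned __int128 p = (unsigned __int128)a[i] * digit + cy;
/           r[i] = (uint64_t)p;          /* lo(p) */
/           cy   = (uint64_t)(p >> 64);  /* hi(p) */
/       }
/       return (cy);
/   }
/
/ The assembly below unrolls this loop eight limbs at a time (.L15) and
/ handles the remaining 0-7 limbs one at a time (.L16).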

.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:

        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L17

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L15:
        cmpq    $8, %r8                 / 8 - len
        jb      .L16
        movq    0(%rsi), %rax           / rax = a[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L17
        jmp     .L15

.L16:
        movq    0(%rsi), %rax
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    8(%rsi), %rax
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    16(%rsi), %rax
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    24(%rsi), %rax
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    32(%rsi), %rax
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    40(%rsi), %rax
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    48(%rsi), %rax
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17


.L17:
        movq    %r9, %rax
        ret

        .size   s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64

/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_add_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
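/ For reference, a rough C-level sketch of what this routine computes.
/ As with the sketch above, it is not part of the original source and
/ assumes the unsigned __int128 extension for the 128 bit product:
/
/   uint64_t
/   s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/   {
/       uint64_t cy = 0;
/       for (int i = 0; i < len; i++) {
/           unsigned __int128 p = (unsigned __int128)a[i] * digit;
/           p += r[i];                   /* p += r[i] */
/           p += cy;                     /* p += cy */
/           r[i] = (uint64_t)p;          /* lo(p) */
/           cy   = (uint64_t)(p >> 64);  /* hi(p) */
/       }
/       return (cy);
/   }
/
/ The assembly below unrolls this loop eight limbs at a time (.L25) and
/ handles the remaining 0-7 limbs one at a time (.L26).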

.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:

        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L27

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L25:
        cmpq    $8, %r8                 / 8 - len
        jb      .L26
        movq    0(%rsi), %rax           / rax = a[0]
        movq    0(%rdi), %r10           / r10 = r[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        movq    8(%rdi), %r10           / prefetch r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        movq    16(%rdi), %r10          / prefetch r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        movq    24(%rdi), %r10          / prefetch r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        movq    32(%rdi), %r10          / prefetch r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        movq    40(%rdi), %r10          / prefetch r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        movq    48(%rdi), %r10          / prefetch r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        movq    56(%rdi), %r10          / prefetch r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L27
        jmp     .L25

.L26:
        movq    0(%rsi), %rax
        movq    0(%rdi), %r10
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    8(%rsi), %rax
        movq    8(%rdi), %r10
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    16(%rsi), %rax
        movq    16(%rdi), %r10
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    24(%rsi), %rax
        movq    24(%rdi), %r10
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    32(%rsi), %rax
        movq    32(%rdi), %r10
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    40(%rsi), %rax
        movq    40(%rdi), %r10
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    48(%rsi), %rax
        movq    48(%rdi), %r10
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27


.L27:
        movq    %r9, %rax
        ret

        .size   s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
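
/ For context, a hedged sketch of how a caller might combine the two
/ routines above into a schoolbook multi-precision multiply.  The
/ function mul_vec64 below is hypothetical and not part of this file;
/ it only illustrates the intended division of labour: the first row of
/ partial products is set, later rows are accumulated, and each returned
/ carry becomes the next-higher limb of the result.
/
/   /* r must have room for alen + blen limbs */
/   void
/   mul_vec64(uint64_t *r, uint64_t *a, int alen, uint64_t *b, int blen)
/   {
/       r[alen] = s_mpv_mul_set_vec64(r, a, alen, b[0]);
/       for (int i = 1; i < blen; i++)
/           r[alen + i] = s_mpv_mul_add_vec64(r + i, a, alen, b[i]);
/   }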