; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

;
; This code is converted from mpi_amd64_gas.asm for MASM for x64.
;

; ------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_set_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

; r = a * digit, r and a are vectors of length len
; returns the carry digit
; r and a are 64-bit aligned.
;
; uint64_t
; s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
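; For reference, a minimal C sketch of the operation computed by the
; routine below (illustrative only, not assembled or compiled here; the
; _ref name is hypothetical and the sketch assumes <stdint.h> plus a
; compiler that provides unsigned __int128):
;
;   uint64_t
;   s_mpv_mul_set_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;   {
;       uint64_t carry = 0;
;       for (int i = 0; i < len; i++) {
;           /* 64x64 -> 128 bit multiply, then fold in the running carry */
;           unsigned __int128 p = (unsigned __int128)a[i] * digit + carry;
;           r[i]  = (uint64_t)p;         /* low 64 bits -> result digit  */
;           carry = (uint64_t)(p >> 64); /* high 64 bits -> next carry   */
;       }
;       return carry;                    /* final carry digit            */
;   }
;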
.CODE

s_mpv_mul_set_vec64 PROC

; Parameter-register compatibility:
;
; The original GAS code follows the System V AMD64 calling convention
; (rdi, rsi, rdx, rcx), while MASM on Windows uses the Microsoft x64
; convention (rcx, rdx, r8, r9).  Remap the incoming arguments so the
; converted body can keep its System V register assignments.  rdi and
; rsi are non-volatile in the Microsoft ABI, so save them first.

        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len
        mov     rcx, r9                 ; rcx = digit

        xor     rax, rax                ; return 0 when len == 0
        test    rdx, rdx
        jz      L17
        mov     r8, rdx                 ; r8 = digits remaining
        xor     r9, r9                  ; r9 = running carry

L15:                                    ; main loop: eight digits per pass
        cmp     r8, 8
        jb      L16
        mov     rax, [rsi]
        mov     r11, [8+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64
        add     rdi, 64
        sub     r8, 8
        jz      L17
        jmp     L15

L16:                                    ; tail: fewer than eight digits left
        mov     rax, [0+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [8+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17

L17:                                    ; return the final carry digit
        mov     rax, r9
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_set_vec64 ENDP


; ------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_add_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

; r += a * digit, r and a are vectors of length len
; returns the carry digit
; r and a are 64-bit aligned.
;
; uint64_t
; s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;
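; For reference, a minimal C sketch of the operation computed by the
; routine below (illustrative only, not assembled or compiled here; the
; _ref name is hypothetical and the sketch assumes <stdint.h> plus a
; compiler that provides unsigned __int128):
;
;   uint64_t
;   s_mpv_mul_add_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;   {
;       uint64_t carry = 0;
;       for (int i = 0; i < len; i++) {
;           /* 64x64 -> 128 bit multiply, then add the existing digit of */
;           /* r and the running carry; the sum still fits in 128 bits.  */
;           unsigned __int128 p = (unsigned __int128)a[i] * digit
;                               + r[i] + carry;
;           r[i]  = (uint64_t)p;         /* low 64 bits -> result digit  */
;           carry = (uint64_t)(p >> 64); /* high 64 bits -> next carry   */
;       }
;       return carry;                    /* final carry digit            */
;   }
;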
s_mpv_mul_add_vec64 PROC

; Remap the Microsoft x64 argument registers (rcx, rdx, r8, r9) to the
; System V layout (rdi, rsi, rdx, rcx) used by the converted body; see
; the note in s_mpv_mul_set_vec64 above.

        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len
        mov     rcx, r9                 ; rcx = digit

        xor     rax, rax                ; return 0 when len == 0
        test    rdx, rdx
        jz      L27
        mov     r8, rdx                 ; r8 = digits remaining
        xor     r9, r9                  ; r9 = running carry

L25:                                    ; main loop: eight digits per pass
        cmp     r8, 8
        jb      L26
        mov     rax, [0+rsi]
        mov     r10, [0+rdi]
        mov     r11, [8+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [8+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [16+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [24+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [32+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [40+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [48+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [56+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64
        add     rdi, 64
        sub     r8, 8
        jz      L27
        jmp     L25

L26:                                    ; tail: fewer than eight digits left
        mov     rax, [0+rsi]
        mov     r10, [0+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [8+rsi]
        mov     r10, [8+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [16+rsi]
        mov     r10, [16+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [24+rsi]
        mov     r10, [24+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [32+rsi]
        mov     r10, [32+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [40+rsi]
        mov     r10, [40+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [48+rsi]
        mov     r10, [48+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27

L27:                                    ; return the final carry digit
        mov     rax, r9

        pop     rsi
        pop     rdi
        ret

s_mpv_mul_add_vec64 ENDP

END