### Copyright (c) 1995, Colin Plumb.
### For licensing and other legal details, see the file legal.c.
###
### Assembly primitives for bignum library, 80386 family, 32-bit code.
###
### Several primitives are included here.  Only lbnMulAdd1 is *really*
### critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
### easy to write as well, so they are included here as well.
### lbnDiv21 and lbnModQ are so easy to write that they're included, too.
###
### All functions here are for 32-bit flat mode.  I.e. near code and
### near data, although the near offsets are 32 bits.
### Preserved registers are esp, ebp, esi, edi and ebx.  That last
### is needed by ELF for PIC, and differs from the IBM PC calling
### convention.
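
## Editor's note (not part of the original source): as a rough C sketch of
## what the critical primitive computes, assuming the library's BNWORD32
## word type (used in the prototypes below) and a uint64_t double-width
## intermediate, lbnMulAdd1 is a word-by-vector multiply-accumulate:
##
##     BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in,
##                            unsigned len, BNWORD32 k)
##     {
##         BNWORD32 carry = 0;
##         while (len--) {
##             uint64_t t = (uint64_t)*in++ * k + *out + carry;
##             *out++ = (BNWORD32)t;        /* low half of the running sum   */
##             carry = (BNWORD32)(t >> 32); /* high half carries to next word */
##         }
##         return carry;
##     }
##
## lbnMulN1 is the same except that it overwrites out[] instead of adding to
## it and stores the final carry in out[len]; lbnMulSub1 subtracts the
## product from out[] and returns the final borrow word.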

# Different assemblers have different conventions here
align4=4        # could be 2 or 4
align8=8        # could be 3 or 8
align16=16      # could be 4 or 16

        .text

# We declare each symbol with two names, to deal with ELF/a.out variances.
        .globl  lbnMulN1_32
        .globl  _lbnMulN1_32
        .globl  lbnMulAdd1_32
        .globl  _lbnMulAdd1_32
        .globl  lbnMulSub1_32
        .globl  _lbnMulSub1_32
        .globl  lbnDiv21_32
        .globl  _lbnDiv21_32
        .globl  lbnModQ_32
        .globl  _lbnModQ_32

## Register usage:
## %eax - low half of product
## %ebx - carry to next iteration
## %ecx - multiplier (k)
## %edx - high half of product
## %esi - source pointer
## %edi - dest pointer
## %ebp - loop counter
##
## Stack frame:
## (The successive columns give each slot's offset from %esp at entry and
## after one, two, three and four registers have been pushed.)
## +--------+ %esp+20  %esp+24  %esp+28  %esp+32  %esp+36
## |   k    |
## +--------+ %esp+16  %esp+20  %esp+24  %esp+28  %esp+32
## |  len   |
## +--------+ %esp+12  %esp+16  %esp+20  %esp+24  %esp+28
## |   in   |
## +--------+ %esp+8   %esp+12  %esp+16  %esp+20  %esp+24
## |  out   |
## +--------+ %esp+4   %esp+8   %esp+12  %esp+16  %esp+20
## | return |
## +--------+ %esp     %esp+4   %esp+8   %esp+12  %esp+16
## |  %esi  |
## +--------+          %esp     %esp+4   %esp+8   %esp+12
## |  %ebp  |
## +--------+                   %esp     %esp+4   %esp+8
## |  %ebx  |
## +--------+                            %esp     %esp+4
## |  %edi  |
## +--------+                                     %esp

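## Presumed C prototypes (editor's note), inferred from the argument layout
## above and from the return-value handling in the code below:
##
##     void     lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
##     BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
##     BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
##
## lbnMulAdd1_32 and lbnMulSub1_32 return the final carry/borrow word in %eax.
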
        .align  align16
lbnMulN1_32:
_lbnMulN1_32:
        pushl   %esi                    # U
        movl    12(%esp),%esi           # V load in
        pushl   %ebp                    # U
        movl    20(%esp),%ebp           # V load len
        pushl   %ebx                    # U
        movl    28(%esp),%ecx           # V load k
        pushl   %edi                    # U
        movl    20(%esp),%edi           # V load out

## First multiply step has no carry in.
        movl    (%esi),%eax             # V
        leal    -4(,%ebp,4),%ebx        # U loop unrolling
        mull    %ecx                    # NP first multiply
        movl    %eax,(%edi)             # U
        andl    $12,%ebx                # V loop unrolling

        addl    %ebx,%esi               # U loop unrolling
        addl    %ebx,%edi               # V loop unrolling

        jmp     *m32_jumptable(%ebx)    # NP loop unrolling

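## Editor's note: %ebx = 4 * ((len-1) mod 4), so %esi and %edi have been
## advanced past the leftover words and the indirect jump enters the four-way
## unrolled loop at the matching partial case (a Duff's-device-style entry);
## the words remaining after that partial pass are then a multiple of four.
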
        .align  align4
m32_jumptable:
        .long   m32_case0
        .long   m32_case1
        .long   m32_case2
        .long   m32_case3

        nop
        .align  align8
        nop
        nop
        nop                             # Get loop nicely aligned

m32_case0:
        subl    $4,%ebp                 # U
        jbe     m32_done                # V

m32_loop:
        movl    4(%esi),%eax            # U
        movl    %edx,%ebx               # V Remember carry for later
        addl    $16,%esi                # U
        addl    $16,%edi                # V
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        adcl    $0,%edx                 # U
        movl    %eax,-12(%edi)          # V
m32_case3:
        movl    -8(%esi),%eax           # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        adcl    $0,%edx                 # U
        movl    %eax,-8(%edi)           # V
m32_case2:
        movl    -4(%esi),%eax           # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        adcl    $0,%edx                 # U
        movl    %eax,-4(%edi)           # V
m32_case1:
        movl    (%esi),%eax             # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        adcl    $0,%edx                 # U
        movl    %eax,(%edi)             # V

        subl    $4,%ebp                 # U
        ja      m32_loop                # V

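## The carry out of the top word is still in %edx; lbnMulN1 stores it as the
## (len+1)th output word below.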
m32_done:
        movl    %edx,4(%edi)            # U
        popl    %edi                    # V
        popl    %ebx                    # U
        popl    %ebp                    # V
        popl    %esi                    # U
        ret                             # NP


        .align  align16
lbnMulAdd1_32:
_lbnMulAdd1_32:

        pushl   %esi                    # U
        movl    12(%esp),%esi           # V load in
        pushl   %edi                    # U
        movl    12(%esp),%edi           # V load out
        pushl   %ebp                    # U
        movl    24(%esp),%ebp           # V load len
        pushl   %ebx                    # U
        movl    32(%esp),%ecx           # V load k

## First multiply step has no carry in.
        movl    (%esi),%eax             # V
        movl    (%edi),%ebx             # U
        mull    %ecx                    # NP first multiply
        addl    %eax,%ebx               # U
        leal    -4(,%ebp,4),%eax        # V loop unrolling
        adcl    $0,%edx                 # U
        andl    $12,%eax                # V loop unrolling
        movl    %ebx,(%edi)             # U

        addl    %eax,%esi               # V loop unrolling
        addl    %eax,%edi               # U loop unrolling

        jmp     *ma32_jumptable(%eax)   # NP loop unrolling

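## Editor's note: same computed-jump entry into the four-way unrolled loop as
## in lbnMulN1 above, but here each word needs two adcl $0,%edx steps: one for
## the carry from adding the incoming carry word, and one for the carry from
## adding the existing out[] word.  %edx cannot wrap, because the combined sum
## in[i]*k + out[i] + carry fits in 64 bits.
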
        .align  align4
ma32_jumptable:
        .long   ma32_case0
        .long   ma32_case1
        .long   ma32_case2
        .long   ma32_case3

        .align  align8
        nop
        nop
        nop                             # To align loop properly


ma32_case0:
        subl    $4,%ebp                 # U
        jbe     ma32_done               # V

ma32_loop:
        movl    4(%esi),%eax            # U
        movl    %edx,%ebx               # V Remember carry for later
        addl    $16,%esi                # U
        addl    $16,%edi                # V
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    -12(%edi),%ebx          # V
        adcl    $0,%edx                 # U
        addl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,-12(%edi)          # V
ma32_case3:
        movl    -8(%esi),%eax           # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    -8(%edi),%ebx           # V
        adcl    $0,%edx                 # U
        addl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,-8(%edi)           # V
ma32_case2:
        movl    -4(%esi),%eax           # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    -4(%edi),%ebx           # V
        adcl    $0,%edx                 # U
        addl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,-4(%edi)           # V
ma32_case1:
        movl    (%esi),%eax             # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    (%edi),%ebx             # V
        adcl    $0,%edx                 # U
        addl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,(%edi)             # V

        subl    $4,%ebp                 # U
        ja      ma32_loop               # V

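## The final carry word is left in %edx and moved to %eax as the return value.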
ma32_done:
        popl    %ebx                    # U
        popl    %ebp                    # V
        movl    %edx,%eax               # U
        popl    %edi                    # V
        popl    %esi                    # U
        ret                             # NP


        .align  align16
lbnMulSub1_32:
_lbnMulSub1_32:
        pushl   %esi                    # U
        movl    12(%esp),%esi           # V load in
        pushl   %edi                    # U
        movl    12(%esp),%edi           # V load out
        pushl   %ebp                    # U
        movl    24(%esp),%ebp           # V load len
        pushl   %ebx                    # U
        movl    32(%esp),%ecx           # V load k

/* First multiply step has no carry in. */
        movl    (%esi),%eax             # V
        movl    (%edi),%ebx             # U
        mull    %ecx                    # NP first multiply
        subl    %eax,%ebx               # U
        leal    -4(,%ebp,4),%eax        # V loop unrolling
        adcl    $0,%edx                 # U
        andl    $12,%eax                # V loop unrolling
        movl    %ebx,(%edi)             # U

        addl    %eax,%esi               # V loop unrolling
        addl    %eax,%edi               # U loop unrolling

        jmp     *ms32_jumptable(%eax)   # NP loop unrolling

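## Editor's note: identical entry trick again; here subl sets the carry flag
## as a borrow, and the following adcl $0,%edx folds that borrow into the
## word subtracted from the next output position, so the routine ultimately
## returns the borrow out of the top word.
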
        .align  align4
ms32_jumptable:
        .long   ms32_case0
        .long   ms32_case1
        .long   ms32_case2
        .long   ms32_case3

        .align  align8
        nop
        nop
        nop

ms32_case0:
        subl    $4,%ebp                 # U
        jbe     ms32_done               # V

ms32_loop:
        movl    4(%esi),%eax            # U
        movl    %edx,%ebx               # V Remember carry for later
        addl    $16,%esi                # U
        addl    $16,%edi                # V
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    -12(%edi),%ebx          # V
        adcl    $0,%edx                 # U
        subl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,-12(%edi)          # V
ms32_case3:
        movl    -8(%esi),%eax           # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    -8(%edi),%ebx           # V
        adcl    $0,%edx                 # U
        subl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,-8(%edi)           # V
ms32_case2:
        movl    -4(%esi),%eax           # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    -4(%edi),%ebx           # V
        adcl    $0,%edx                 # U
        subl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,-4(%edi)           # V
ms32_case1:
        movl    (%esi),%eax             # U
        movl    %edx,%ebx               # V Remember carry for later
        mull    %ecx                    # NP
        addl    %ebx,%eax               # U Add carry in from previous word
        movl    (%edi),%ebx             # V
        adcl    $0,%edx                 # U
        subl    %eax,%ebx               # V
        adcl    $0,%edx                 # U
        movl    %ebx,(%edi)             # V

        subl    $4,%ebp                 # U
        ja      ms32_loop               # V

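## The final borrow word is left in %edx and moved to %eax as the return value.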
ms32_done:
        popl    %ebx                    # U
        popl    %ebp                    # V
        movl    %edx,%eax               # U
        popl    %edi                    # V
        popl    %esi                    # U
        ret                             # NP

## Two-word by one-word divide.  Stores quotient, returns remainder.
## BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
##                                4            8            12          16

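## Editor's note: in C terms this is roughly
##     *q = (BNWORD32)((((uint64_t)nh << 32) | nl) / d);
##     return (BNWORD32)((((uint64_t)nh << 32) | nl) % d);
## done with a single divl.  The caller must guarantee nh < d, otherwise the
## quotient does not fit in 32 bits and divl raises a divide-error fault.
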
        .align  align16
lbnDiv21_32:
_lbnDiv21_32:
        movl    8(%esp),%edx            # U Load nh
        movl    12(%esp),%eax           # V Load nl
        movl    4(%esp),%ecx            # U Load q
        divl    16(%esp)                # NP
        movl    %eax,(%ecx)             # U Store quotient
        movl    %edx,%eax               # V Return remainder
        ret

## Multi-word by one-word remainder.
## This speeds up key generation.  It's not worth unrolling and so on;
## using 32-bit divides is enough of a speedup.
##
## The modulus (loaded into %ecx) is often 16 bits.  Given that the dividend
## is 32 bits, the chances of saving the first divide because the high word
## of the dividend is less than the modulus are low enough that it's not
## worth taking the cycles to test for it.
##
## unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
##                                     4             8            12
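## Editor's note: roughly, in C terms,
##     unsigned r = 0;
##     while (len--)
##         r = (unsigned)((((uint64_t)r << 32) | n[len]) % d);
##     return r;
## i.e. the words are reduced from most significant down to least, carrying
## the running remainder in %edx between divides.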
        .align  align16
lbnModQ_32:
_lbnModQ_32:
        movl    4(%esp),%eax            # U Load n
        pushl   %ebp                    # V
        movl    12(%esp),%ebp           # U Load len
        pushl   %esi                    # V
        leal    -4(%eax,%ebp,4),%esi    # U
        movl    20(%esp),%ecx           # V Load d
        xorl    %edx,%edx               # U Clear MSW for first divide
modq32_loop:
        movl    (%esi),%eax             # U
        subl    $4,%esi                 # V
        divl    %ecx                    # NP
        decl    %ebp                    # U
        jnz     modq32_loop             # V

        popl    %esi                    # U
        movl    %edx,%eax               # V
        popl    %ebp                    # U
        ret                             # NP