Travis Cross d2edcad66e Merge Phil Zimmermann's libzrtp as a FreeSWITCH library
Thanks to Phil Zimmermann for the code and for the license exception
we needed to include it.

There remains some build system integration work to be done before
this code will build properly in the FreeSWITCH tree.
2012-03-31 23:42:27 +00:00

395 lines
9.2 KiB
ArmAsm

### Copyright (c) 1995, Colin Plumb.
### For licensing and other legal details, see the file legal.c.
###
### Assembly primitives for bignum library, 80386 family, 32-bit code.
###
### Several primitives are included here. Only lbnMulAdd1 is *really*
### critical, but once that's written, lnmMulN1 and lbnMulSub1 are quite
### easy to write as well, so they are included here as well.
### lbnDiv21 and lbnModQ are so easy to write that they're included, too.
###
### All functions here are for 32-bit flat mode. I.e. near code and
### near data, although the near offsets are 32 bits.
### Preserved registers are esp, ebp, esi, edi and ebx. That last
### is needed by ELF for PIC, and differs from the IBM PC calling
### convention.
# Different assemblers have different conventions here
align4=4 # could be 2 or 4
align8=8 # could be 3 or 8
align16=16 # cound be 4 or 16
.text
# We declare each symbol with two names, to deal with ELF/a.out variances.
.globl lbnMulN1_32
.globl _lbnMulN1_32
.globl lbnMulAdd1_32
.globl _lbnMulAdd1_32
.globl lbnMulSub1_32
.globl _lbnMulSub1_32
.globl lbnDiv21_32
.globl _lbnDiv21_32
.globl lbnModQ_32
.globl _lbnModQ_32
## Register usage:
## %eax - low half of product
## %ebx - carry to next iteration
## %ecx - multiplier (k)
## %edx - high half of product
## %esi - source pointer
## %edi - dest pointer
## %ebp - loop counter
##
## Stack frame:
## +--------+ %esp+20 %esp+24 %esp+28 %esp+32 %esp+36
## | k |
## +--------+ %esp+16 %esp+20 %esp+24 %esp+28 %esp+32
## | len |
## +--------+ %esp+12 %esp+16 %esp+20 %esp+24 %esp+28
## | in |
## +--------+ %esp+8 %esp+12 %esp+16 %esp+20 %esp+24
## | out |
## +--------+ %esp+4 %esp+8 %esp+12 %esp+16 %esp+20
## | return |
## +--------+ %esp %esp+4 %esp+8 %esp+12 %esp+16
## | %esi |
## +--------+ %esp %esp+4 %esp+8 %esp+12
## | %ebp |
## +--------+ %esp %esp+4 %esp+8
## | %ebx |
## +--------+ %esp %esp+4
## | %edi |
## +--------+ %esp
.align align16
lbnMulN1_32:
_lbnMulN1_32:
pushl %esi # U
movl 12(%esp),%esi # V load in
pushl %ebp # U
movl 20(%esp),%ebp # V load len
pushl %ebx # U
movl 28(%esp),%ecx # V load k
pushl %edi # U
movl 20(%esp),%edi # V load out
## First multiply step has no carry in.
movl (%esi),%eax # V
leal -4(,%ebp,4),%ebx # U loop unrolling
mull %ecx # NP first multiply
movl %eax,(%edi) # U
andl $12,%ebx # V loop unrolling
addl %ebx,%esi # U loop unrolling
addl %ebx,%edi # V loop unrolling
jmp *m32_jumptable(%ebx) # NP loop unrolling
.align align4
m32_jumptable:
.long m32_case0
.long m32_case1
.long m32_case2
.long m32_case3
nop
.align align8
nop
nop
nop # Get loop nicely aligned
m32_case0:
subl $4,%ebp # U
jbe m32_done # V
m32_loop:
movl 4(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
addl $16,%esi # U
addl $16,%edi # V
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
adcl $0,%edx # U
movl %eax,-12(%edi) # V
m32_case3:
movl -8(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
adcl $0,%edx # U
movl %eax,-8(%edi) # V
m32_case2:
movl -4(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
adcl $0,%edx # U
movl %eax,-4(%edi) # V
m32_case1:
movl (%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
adcl $0,%edx # U
movl %eax,(%edi) # V
subl $4,%ebp # U
ja m32_loop # V
m32_done:
movl %edx,4(%edi) # U
popl %edi # V
popl %ebx # U
popl %ebp # V
popl %esi # U
ret # NP
.align align16
lbnMulAdd1_32:
_lbnMulAdd1_32:
pushl %esi # U
movl 12(%esp),%esi # V load in
pushl %edi # U
movl 12(%esp),%edi # V load out
pushl %ebp # U
movl 24(%esp),%ebp # V load len
pushl %ebx # U
movl 32(%esp),%ecx # V load k
## First multiply step has no carry in.
movl (%esi),%eax # V
movl (%edi),%ebx # U
mull %ecx # NP first multiply
addl %eax,%ebx # U
leal -4(,%ebp,4),%eax # V loop unrolling
adcl $0,%edx # U
andl $12,%eax # V loop unrolling
movl %ebx,(%edi) # U
addl %eax,%esi # V loop unrolling
addl %eax,%edi # U loop unrolling
jmp *ma32_jumptable(%eax) # NP loop unrolling
.align align4
ma32_jumptable:
.long ma32_case0
.long ma32_case1
.long ma32_case2
.long ma32_case3
.align align8
nop
nop
nop # To align loop properly
ma32_case0:
subl $4,%ebp # U
jbe ma32_done # V
ma32_loop:
movl 4(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
addl $16,%esi # U
addl $16,%edi # V
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl -12(%edi),%ebx # V
adcl $0,%edx # U
addl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,-12(%edi) # V
ma32_case3:
movl -8(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl -8(%edi),%ebx # V
adcl $0,%edx # U
addl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,-8(%edi) # V
ma32_case2:
movl -4(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl -4(%edi),%ebx # V
adcl $0,%edx # U
addl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,-4(%edi) # V
ma32_case1:
movl (%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl (%edi),%ebx # V
adcl $0,%edx # U
addl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,(%edi) # V
subl $4,%ebp # U
ja ma32_loop # V
ma32_done:
popl %ebx # U
popl %ebp # V
movl %edx,%eax # U
popl %edi # V
popl %esi # U
ret # NP
.align align16
lbnMulSub1_32:
_lbnMulSub1_32:
pushl %esi # U
movl 12(%esp),%esi # V load in
pushl %edi # U
movl 12(%esp),%edi # V load out
pushl %ebp # U
movl 24(%esp),%ebp # V load len
pushl %ebx # U
movl 32(%esp),%ecx # V load k
/* First multiply step has no carry in. */
movl (%esi),%eax # V
movl (%edi),%ebx # U
mull %ecx # NP first multiply
subl %eax,%ebx # U
leal -4(,%ebp,4),%eax # V loop unrolling
adcl $0,%edx # U
andl $12,%eax # V loop unrolling
movl %ebx,(%edi) # U
addl %eax,%esi # V loop unrolling
addl %eax,%edi # U loop unrolling
jmp *ms32_jumptable(%eax) # NP loop unrolling
.align align4
ms32_jumptable:
.long ms32_case0
.long ms32_case1
.long ms32_case2
.long ms32_case3
.align align8
nop
nop
nop
ms32_case0:
subl $4,%ebp # U
jbe ms32_done # V
ms32_loop:
movl 4(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
addl $16,%esi # U
addl $16,%edi # V
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl -12(%edi),%ebx # V
adcl $0,%edx # U
subl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,-12(%edi) # V
ms32_case3:
movl -8(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl -8(%edi),%ebx # V
adcl $0,%edx # U
subl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,-8(%edi) # V
ms32_case2:
movl -4(%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl -4(%edi),%ebx # V
adcl $0,%edx # U
subl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,-4(%edi) # V
ms32_case1:
movl (%esi),%eax # U
movl %edx,%ebx # V Remember carry for later
mull %ecx # NP
addl %ebx,%eax # U Add carry in from previous word
movl (%edi),%ebx # V
adcl $0,%edx # U
subl %eax,%ebx # V
adcl $0,%edx # U
movl %ebx,(%edi) # V
subl $4,%ebp # U
ja ms32_loop # V
ms32_done:
popl %ebx # U
popl %ebp # V
movl %edx,%eax # U
popl %edi # V
popl %esi # U
ret # NP
## Two-word by one-word divide. Stores quotient, returns remainder.
## BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
## 4 8 12 16
.align align16
lbnDiv21_32:
_lbnDiv21_32:
movl 8(%esp),%edx # U Load nh
movl 12(%esp),%eax # V Load nl
movl 4(%esp),%ecx # U Load q
divl 16(%esp) # NP
movl %eax,(%ecx) # U Store quotient
movl %edx,%eax # V Return remainder
ret
## Multi-word by one-word remainder.
## This speeds up key generation. It's not worth unrolling and so on;
## using 32-bit divides is enough of a speedup.
##
## The modulus (in %ebp) is often 16 bits. Given that the dividend is 32
## bits, the chances of saving the first divide because the high word of the
## dividend is less than the modulus are low enough it's not worth taking
## the cycles to test for it.
##
## unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
## 4 8 12
.align align16
lbnModQ_32:
_lbnModQ_32:
movl 4(%esp),%eax # U Load n
pushl %ebp # V
movl 12(%esp),%ebp # U Load len
pushl %esi # V
leal -4(%eax,%ebp,4),%esi # U
movl 20(%esp),%ecx # V Load d
xorl %edx,%edx # U Clear MSW for first divide
modq32_loop:
movl (%esi),%eax # U
subl $4,%esi # V
divl %ecx # NP
decl %ebp # U
jnz modq32_loop # V
popl %esi # U
movl %edx,%eax # V
popl %ebp # U
ret # NP