mirror of
https://github.com/signalwire/freeswitch.git
synced 2025-02-05 10:34:54 +00:00
d2edcad66e
Thanks to Phil Zimmermann for the code and for the license exception we needed to include it. There remains some build system integration work to be done before this code will build properly in the FreeSWITCH tree.
415 lines
9.1 KiB
NASM
415 lines
9.1 KiB
NASM
;;; Copyright (c) 1995, Colin Plumb.
|
|
;;; For licensing and other legal details, see the file legal.c.
|
|
;;;
|
|
;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
|
|
;;;
|
|
;;; Several primitives are included here. Only lbnMulAdd1 is *really*
|
|
;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
|
|
;;; easy to write as well, so they are included here as well.
|
|
;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
|
|
;;;
|
|
;;; All functions here are for 32-bit flat mode. I.e. near code and
|
|
;;; near data, although the near offsets are 32 bits.
|
|
;;;
|
|
;;; The usual 80x86 calling conventions have AX, BX, CX and DX
|
|
;;; volatile, and SI, DI, SP and BP preserved across calls.
|
|
;;; This includes the "E"xtended forms of all of those registers
|
|
;;;
|
|
;;; However, just to be confusing, recent 32-bit DOS compilers have
|
|
;;; quietly changed that to require EBX preserved across calls, too.
|
|
;;; Joy.
|
|
|
|
.386
|
|
;_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
|
|
;_TEXT ends
|
|
|
|
ifdef @Version
|
|
if @Version le 510
|
|
FLAT group _TEXT
|
|
endif
|
|
else
|
|
FLAT group _TEXT
|
|
endif
|
|
assume cs:FLAT, ds:FLAT, ss:FLAT
|
|
_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
|
|
|
|
public _lbnMulN1_32
|
|
public _lbnMulAdd1_32
|
|
public _lbnMulSub1_32
|
|
public _lbnDiv21_32
|
|
public _lbnModQ_32
|
|
|
|
;; Register usage:
|
|
;; eax - low half of product
|
|
;; ebx - carry to next iteration
|
|
;; ecx - multiplier (k)
|
|
;; edx - high half of product
|
|
;; esi - source pointer
|
|
;; edi - dest pointer
|
|
;; ebp - loop counter
|
|
;;
|
|
;; Stack frame:
|
|
;; +--------+ esp+20 esp+24 esp+28 esp+32 esp+36
|
|
;; | k |
|
|
;; +--------+ esp+16 esp+20 esp+24 esp+28 esp+32
|
|
;; | len |
|
|
;; +--------+ esp+12 esp+16 esp+20 esp+24 esp+28
|
|
;; | in |
|
|
;; +--------+ esp+8 esp+12 esp+16 esp+20 esp+24
|
|
;; | out |
|
|
;; +--------+ esp+4 esp+8 esp+12 esp+16 esp+20
|
|
;; | return |
|
|
;; +--------+ esp esp+4 esp+8 esp+12 esp+16
|
|
;; | esi |
|
|
;; +--------+ esp esp+4 esp+8 esp+12
|
|
;; | ebp |
|
|
;; +--------+ esp esp+4 esp+8
|
|
;; | ebx |
|
|
;; +--------+ esp esp+4
|
|
;; | edi |
|
|
;; +--------+ esp
|
|
|
|
align 16
|
|
_lbnMulN1_32 proc near
|
|
|
|
push esi ; U
|
|
mov esi,[esp+12] ; V load in
|
|
push ebp ; U
|
|
mov ebp,[esp+20] ; V load len
|
|
push ebx ; U
|
|
mov ecx,[esp+28] ; V load k
|
|
push edi ; U
|
|
mov edi,[esp+20] ; V load out
|
|
|
|
;; First multiply step has no carry in.
|
|
mov eax,[esi] ; U
|
|
lea ebx,[ebp*4-4] ; V loop unrolling
|
|
mul ecx ; NP first multiply
|
|
mov [edi],eax ; U
|
|
and ebx,12 ; V loop unrolling
|
|
|
|
add esi,ebx ; U loop unrolling
|
|
add edi,ebx ; V loop unrolling
|
|
|
|
jmp DWORD PTR m32_jumptable[ebx] ; NP loop unrolling
|
|
|
|
align 4
|
|
m32_jumptable:
|
|
dd m32_case0
|
|
dd m32_case1
|
|
dd m32_case2
|
|
dd m32_case3
|
|
|
|
nop
|
|
align 8
|
|
nop
|
|
nop
|
|
nop ; Get loop nicely aligned
|
|
|
|
m32_case0:
|
|
sub ebp,4 ; U
|
|
jbe SHORT m32_done ; V
|
|
|
|
m32_loop:
|
|
mov eax,[esi+4] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
add esi,16 ; U
|
|
add edi,16 ; V
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
adc edx,0 ; U
|
|
mov [edi-12],eax ; V
|
|
m32_case3:
|
|
mov eax,[esi-8] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
adc edx,0 ; U
|
|
mov [edi-8],eax ; V
|
|
m32_case2:
|
|
mov eax,[esi-4] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
adc edx,0 ; U
|
|
mov [edi-4],eax ; V
|
|
m32_case1:
|
|
mov eax,[esi] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
adc edx,0 ; U
|
|
mov [edi],eax ; V
|
|
|
|
sub ebp,4 ; U
|
|
ja SHORT m32_loop ; V
|
|
|
|
m32_done:
|
|
mov [edi+4],edx ; U
|
|
pop edi ; V
|
|
pop ebx ; U
|
|
pop ebp ; V
|
|
pop esi ; U
|
|
ret ; NP
|
|
_lbnMulN1_32 endp
|
|
|
|
|
|
align 16
|
|
_lbnMulAdd1_32 proc near
|
|
|
|
push esi ; U
|
|
mov esi,[esp+12] ; V load in
|
|
push edi ; U
|
|
mov edi,[esp+12] ; V load out
|
|
push ebp ; U
|
|
mov ebp,[esp+24] ; V load len
|
|
push ebx ; U
|
|
mov ecx,[esp+32] ; V load k
|
|
|
|
;; First multiply step has no carry in.
|
|
mov eax,[esi] ; U
|
|
mov ebx,[edi] ; V
|
|
mul ecx ; NP first multiply
|
|
add ebx,eax ; U
|
|
lea eax,[ebp*4-4] ; V loop unrolling
|
|
adc edx,0 ; U
|
|
and eax,12 ; V loop unrolling
|
|
mov [edi],ebx ; U
|
|
|
|
add esi,eax ; V loop unrolling
|
|
add edi,eax ; U loop unrolling
|
|
|
|
jmp DWORD PTR ma32_jumptable[eax] ; NP loop unrolling
|
|
|
|
align 4
|
|
ma32_jumptable:
|
|
dd ma32_case0
|
|
dd ma32_case1
|
|
dd ma32_case2
|
|
dd ma32_case3
|
|
|
|
nop
|
|
align 8
|
|
nop
|
|
nop
|
|
nop ; To align loop properly
|
|
|
|
|
|
ma32_case0:
|
|
sub ebp,4 ; U
|
|
jbe SHORT ma32_done ; V
|
|
|
|
ma32_loop:
|
|
mov eax,[esi+4] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
add esi,16 ; U
|
|
add edi,16 ; V
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi-12] ; V
|
|
adc edx,0 ; U
|
|
add ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi-12],ebx ; V
|
|
ma32_case3:
|
|
mov eax,[esi-8] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi-8] ; V
|
|
adc edx,0 ; U
|
|
add ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi-8],ebx ; V
|
|
ma32_case2:
|
|
mov eax,[esi-4] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi-4] ; V
|
|
adc edx,0 ; U
|
|
add ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi-4],ebx ; V
|
|
ma32_case1:
|
|
mov eax,[esi] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi] ; V
|
|
adc edx,0 ; U
|
|
add ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi],ebx ; V
|
|
|
|
sub ebp,4 ; U
|
|
ja SHORT ma32_loop ; V
|
|
|
|
ma32_done:
|
|
pop ebx ; U
|
|
pop ebp ; V
|
|
mov eax,edx ; U
|
|
pop edi ; V
|
|
pop esi ; U
|
|
ret ; NP
|
|
_lbnMulAdd1_32 endp
|
|
|
|
|
|
align 16
|
|
_lbnMulSub1_32 proc near
|
|
push esi ; U
|
|
mov esi,[esp+12] ; V load in
|
|
push edi ; U
|
|
mov edi,[esp+12] ; V load out
|
|
push ebp ; U
|
|
mov ebp,[esp+24] ; V load len
|
|
push ebx ; U
|
|
mov ecx,[esp+32] ; V load k
|
|
|
|
;; First multiply step has no carry in.
|
|
push esi ; U
|
|
mov esi,[esp+12] ; V load in
|
|
push edi ; U
|
|
mov edi,[esp+12] ; V load out
|
|
push ebp ; U
|
|
mov ebp,[esp+24] ; V load len
|
|
mov ecx,[esp+28] ; U load k
|
|
|
|
;; First multiply step has no carry in.
|
|
mov eax,[esi] ; V
|
|
mov ebx,[edi] ; U
|
|
mul ecx ; NP first multiply
|
|
sub ebx,eax ; U
|
|
lea eax,[ebp*4-4] ; V loop unrolling
|
|
adc edx,0 ; U
|
|
and eax,12 ; V loop unrolling
|
|
mov [edi],ebx ; U
|
|
|
|
add esi,eax ; V loop unrolling
|
|
add edi,eax ; U loop unrolling
|
|
|
|
jmp DWORD PTR ms32_jumptable[eax] ; NP loop unrolling
|
|
|
|
align 4
|
|
ms32_jumptable:
|
|
dd ms32_case0
|
|
dd ms32_case1
|
|
dd ms32_case2
|
|
dd ms32_case3
|
|
|
|
nop
|
|
align 8
|
|
nop
|
|
nop
|
|
nop
|
|
|
|
ms32_case0:
|
|
sub ebp,4 ; U
|
|
jbe SHORT ms32_done ; V
|
|
|
|
ms32_loop:
|
|
mov eax,[esi+4] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
add esi,16 ; U
|
|
add edi,16 ; V
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi-12] ; V
|
|
adc edx,0 ; U
|
|
sub ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi-12],ebx ; V
|
|
ms32_case3:
|
|
mov eax,[esi-8] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi-8] ; V
|
|
adc edx,0 ; U
|
|
sub ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi-8],ebx ; V
|
|
ms32_case2:
|
|
mov eax,[esi-4] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi-4] ; V
|
|
adc edx,0 ; U
|
|
sub ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi-4],ebx ; V
|
|
ms32_case1:
|
|
mov eax,[esi] ; U
|
|
mov ebx,edx ; V Remember carry for later
|
|
mul ecx ; NP
|
|
add eax,ebx ; U Add carry in from previous word
|
|
mov ebx,[edi] ; V
|
|
adc edx,0 ; U
|
|
sub ebx,eax ; V
|
|
adc edx,0 ; U
|
|
mov [edi],ebx ; V
|
|
|
|
sub ebp,4 ; U
|
|
ja SHORT ms32_loop ; V
|
|
|
|
ms32_done:
|
|
pop ebx ; U
|
|
pop ebp ; V
|
|
mov eax,edx ; U
|
|
pop edi ; V
|
|
pop esi ; U
|
|
ret ; NP
|
|
_lbnMulSub1_32 endp
|
|
|
|
|
|
|
|
;; Two-word by one-word divide. Stores quotient, returns remainder.
|
|
;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
|
|
;; 4 8 12 16
|
|
align 4
|
|
_lbnDiv21_32 proc near
|
|
mov edx,[esp+8] ; U Load nh
|
|
mov eax,[esp+12] ; V Load nl
|
|
mov ecx,[esp+4] ; U Load q
|
|
div DWORD PTR [esp+16] ; NP
|
|
mov [ecx],eax ; U Store quotient
|
|
mov eax,edx ; V Return remainder
|
|
ret
|
|
_lbnDiv21_32 endp
|
|
|
|
;; Multi-word by one-word remainder.
|
|
;; This speeds up key generation. It's not worth unrolling and so on;
|
|
;; using 32-bit divides is enough of a speedup.
|
|
;;
|
|
;; The modulus (in ecx) is often 16 bits. Given that the dividend is 32
|
|
;; bits, the chances of saving the first divide because the high word of the
|
|
;; dividend is less than the modulus are low enough it's not worth taking
|
|
;; the cycles to test for it.
|
|
;;
|
|
;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
|
|
;; 4 8 12
|
|
align 4
|
|
_lbnModQ_32 proc near
|
|
mov eax,[esp+4] ; U Load n
|
|
push ebp ; V
|
|
mov ebp,[esp+12] ; U Load len
|
|
push esi ; V
|
|
lea esi,[ebp*4+eax-4] ; U
|
|
mov ecx,[esp+20] ; V Load d
|
|
xor edx,edx ; U Clear edx for first iteration
|
|
modq32_loop:
|
|
mov eax,[esi] ; U Load new low word for divide
|
|
sub esi,4 ; V
|
|
div ecx ; NP edx = edx:eax % ecx
|
|
dec ebp ; U
|
|
jnz SHORT modq32_loop ; V
|
|
|
|
pop esi ; U
|
|
mov eax,edx ; V Return remainder in eax
|
|
pop ebp ; U
|
|
ret ; NP
|
|
_lbnModQ_32 endp
|
|
|
|
_TEXT   ends                            ; close the _TEXT segment opened near the top of the file

        end
|