;;; Copyright (c) 1995, Colin Plumb.
;;; For licensing and other legal details, see the file legal.c.
;;;
;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
;;;
;;; Several primitives are included here.  Only lbnMulAdd1 is *really*
;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
;;; easy to write, so they are included here as well.
;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
;;;
;;; All functions here are for 32-bit flat mode, i.e. near code and
;;; near data, although the near offsets are 32 bits.
;;;
;;; The usual 80x86 calling conventions have AX, BX, CX and DX
;;; volatile, and SI, DI, SP and BP preserved across calls.
;;; This includes the "E"xtended forms of all of those registers.
;;;
;;; However, just to be confusing, recent 32-bit DOS compilers have
;;; quietly changed that to require EBX preserved across calls, too.
;;; Joy.

	.386
;_TEXT	segment para public use32 'CODE'	; 16-byte aligned because 486 cares
;_TEXT	ends

ifdef @Version
if @Version le 510
FLAT	group	_TEXT
endif
else
FLAT	group	_TEXT
endif
	assume	cs:FLAT, ds:FLAT, ss:FLAT
_TEXT	segment para public use32 'CODE'	; 16-byte aligned because 486 cares

	public	_lbnMulN1_32
	public	_lbnMulAdd1_32
	public	_lbnMulSub1_32
	public	_lbnDiv21_32
	public	_lbnModQ_32

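;; Note (added annotation): the "; U", "; V" and "; NP" tags on the
;; instructions below mark the intended Pentium pipeline pairing:
;; U = U pipe, V = V pipe (paired with the preceding U instruction),
;; NP = not pairable.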
;; Register usage:
;; eax - low half of product
;; ebx - carry to next iteration
;; ecx - multiplier (k)
;; edx - high half of product
;; esi - source pointer
;; edi - dest pointer
;; ebp - loop counter
;;
;; Stack frame (the columns of offsets run left to right from function
;; entry through each successive register push):
;; +--------+  esp+20  esp+24  esp+28  esp+32  esp+36
;; |    k   |
;; +--------+  esp+16  esp+20  esp+24  esp+28  esp+32
;; |   len  |
;; +--------+  esp+12  esp+16  esp+20  esp+24  esp+28
;; |   in   |
;; +--------+   esp+8  esp+12  esp+16  esp+20  esp+24
;; |   out  |
;; +--------+   esp+4   esp+8  esp+12  esp+16  esp+20
;; | return |
;; +--------+     esp   esp+4   esp+8  esp+12  esp+16
;; |  esi   |
;; +--------+             esp   esp+4   esp+8  esp+12
;; |  ebp   |
;; +--------+                     esp   esp+4   esp+8
;; |  ebx   |
;; +--------+                             esp   esp+4
;; |  edi   |
;; +--------+                                     esp

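;; For reference, a rough C sketch of what this routine computes (an
;; illustration only, assuming BNWORD32 is a 32-bit unsigned type and a
;; 64-bit unsigned type is available; it is not part of the original code):
;;
;;	/* out[0..len] = in[0..len-1] * k; len must be >= 1 */
;;	void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in,
;;	                 unsigned len, BNWORD32 k)
;;	{
;;		BNWORD32 carry = 0;
;;		unsigned i;
;;		for (i = 0; i < len; i++) {
;;			unsigned long long p =
;;				(unsigned long long)in[i] * k + carry;
;;			out[i] = (BNWORD32)p;
;;			carry = (BNWORD32)(p >> 32);
;;		}
;;		out[len] = carry;	/* len+1 words are written */
;;	}
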
align 16
_lbnMulN1_32	proc	near

	push	esi			; U
	mov	esi,[esp+12]		; V load in
	push	ebp			; U
	mov	ebp,[esp+20]		; V load len
	push	ebx			; U
	mov	ecx,[esp+28]		; V load k
	push	edi			; U
	mov	edi,[esp+20]		; V load out

;; First multiply step has no carry in.
	mov	eax,[esi]		; U
	lea	ebx,[ebp*4-4]		; V loop unrolling
	mul	ecx			; NP first multiply
	mov	[edi],eax		; U
	and	ebx,12			; V loop unrolling

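;; The two "loop unrolling" lines above computed ebx = 4*((len-1) mod 4).
;; The pointers are advanced past that many leading words and the jump
;; table below enters the 4x-unrolled loop at the matching case label
;; (a Duff's-device style dispatch), so the leftover words are handled
;; before the first full pass through the loop.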
	add	esi,ebx			; U loop unrolling
	add	edi,ebx			; V loop unrolling

	jmp	DWORD PTR m32_jumptable[ebx]	; NP loop unrolling

	align	4
m32_jumptable:
	dd	m32_case0
	dd	m32_case1
	dd	m32_case2
	dd	m32_case3

	nop
	align	8
	nop
	nop
	nop				; Get loop nicely aligned

m32_case0:
	sub	ebp,4			; U
	jbe	SHORT m32_done		; V

m32_loop:
	mov	eax,[esi+4]		; U
	mov	ebx,edx			; V Remember carry for later
	add	esi,16			; U
	add	edi,16			; V
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	adc	edx,0			; U
	mov	[edi-12],eax		; V
m32_case3:
	mov	eax,[esi-8]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	adc	edx,0			; U
	mov	[edi-8],eax		; V
m32_case2:
	mov	eax,[esi-4]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	adc	edx,0			; U
	mov	[edi-4],eax		; V
m32_case1:
	mov	eax,[esi]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	adc	edx,0			; U
	mov	[edi],eax		; V

	sub	ebp,4			; U
	ja	SHORT m32_loop		; V

m32_done:
	mov	[edi+4],edx		; U
	pop	edi			; V
	pop	ebx			; U
	pop	ebp			; V
	pop	esi			; U
	ret				; NP
_lbnMulN1_32	endp

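;; For reference, a rough C sketch of what lbnMulAdd1_32 computes (an
;; illustration only, with the same BNWORD32 assumptions as above; not
;; part of the original code):
;;
;;	/* out[0..len-1] += in[0..len-1] * k; returns the carry word */
;;	BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in,
;;	                       unsigned len, BNWORD32 k)
;;	{
;;		BNWORD32 carry = 0;
;;		unsigned i;
;;		for (i = 0; i < len; i++) {
;;			unsigned long long p =
;;				(unsigned long long)in[i] * k + carry + out[i];
;;			out[i] = (BNWORD32)p;
;;			carry = (BNWORD32)(p >> 32);
;;		}
;;		return carry;
;;	}
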
align 16
_lbnMulAdd1_32	proc	near

	push	esi			; U
	mov	esi,[esp+12]		; V load in
	push	edi			; U
	mov	edi,[esp+12]		; V load out
	push	ebp			; U
	mov	ebp,[esp+24]		; V load len
	push	ebx			; U
	mov	ecx,[esp+32]		; V load k

;; First multiply step has no carry in.
	mov	eax,[esi]		; U
	mov	ebx,[edi]		; V
	mul	ecx			; NP first multiply
	add	ebx,eax			; U
	lea	eax,[ebp*4-4]		; V loop unrolling
	adc	edx,0			; U
	and	eax,12			; V loop unrolling
	mov	[edi],ebx		; U

	add	esi,eax			; V loop unrolling
	add	edi,eax			; U loop unrolling

	jmp	DWORD PTR ma32_jumptable[eax]	; NP loop unrolling

	align	4
ma32_jumptable:
	dd	ma32_case0
	dd	ma32_case1
	dd	ma32_case2
	dd	ma32_case3

	nop
	align	8
	nop
	nop
	nop				; To align loop properly

ma32_case0:
	sub	ebp,4			; U
	jbe	SHORT ma32_done		; V

ma32_loop:
	mov	eax,[esi+4]		; U
	mov	ebx,edx			; V Remember carry for later
	add	esi,16			; U
	add	edi,16			; V
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi-12]		; V
	adc	edx,0			; U
	add	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi-12],ebx		; V
ma32_case3:
	mov	eax,[esi-8]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi-8]		; V
	adc	edx,0			; U
	add	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi-8],ebx		; V
ma32_case2:
	mov	eax,[esi-4]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi-4]		; V
	adc	edx,0			; U
	add	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi-4],ebx		; V
ma32_case1:
	mov	eax,[esi]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi]		; V
	adc	edx,0			; U
	add	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi],ebx		; V

	sub	ebp,4			; U
	ja	SHORT ma32_loop		; V

ma32_done:
	pop	ebx			; U
	pop	ebp			; V
	mov	eax,edx			; U
	pop	edi			; V
	pop	esi			; U
	ret				; NP
_lbnMulAdd1_32	endp

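;; For reference, a rough C sketch of what lbnMulSub1_32 computes (an
;; illustration only, with the same BNWORD32 assumptions as above; not
;; part of the original code):
;;
;;	/* out[0..len-1] -= in[0..len-1] * k; returns the borrow word */
;;	BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in,
;;	                       unsigned len, BNWORD32 k)
;;	{
;;		BNWORD32 borrow = 0;
;;		unsigned i;
;;		for (i = 0; i < len; i++) {
;;			unsigned long long p =
;;				(unsigned long long)in[i] * k + borrow;
;;			BNWORD32 t = out[i];
;;			out[i] = t - (BNWORD32)p;
;;			borrow = (BNWORD32)(p >> 32) + (t < (BNWORD32)p);
;;		}
;;		return borrow;
;;	}
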
align 16
_lbnMulSub1_32	proc	near
	push	esi			; U
	mov	esi,[esp+12]		; V load in
	push	edi			; U
	mov	edi,[esp+12]		; V load out
	push	ebp			; U
	mov	ebp,[esp+24]		; V load len
	push	ebx			; U
	mov	ecx,[esp+32]		; V load k

;; First multiply step has no carry in.
	mov	eax,[esi]		; V
	mov	ebx,[edi]		; U
	mul	ecx			; NP first multiply
	sub	ebx,eax			; U
	lea	eax,[ebp*4-4]		; V loop unrolling
	adc	edx,0			; U
	and	eax,12			; V loop unrolling
	mov	[edi],ebx		; U

	add	esi,eax			; V loop unrolling
	add	edi,eax			; U loop unrolling

	jmp	DWORD PTR ms32_jumptable[eax]	; NP loop unrolling

	align	4
ms32_jumptable:
	dd	ms32_case0
	dd	ms32_case1
	dd	ms32_case2
	dd	ms32_case3

	nop
	align	8
	nop
	nop
	nop

ms32_case0:
	sub	ebp,4			; U
	jbe	SHORT ms32_done		; V

ms32_loop:
	mov	eax,[esi+4]		; U
	mov	ebx,edx			; V Remember carry for later
	add	esi,16			; U
	add	edi,16			; V
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi-12]		; V
	adc	edx,0			; U
	sub	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi-12],ebx		; V
ms32_case3:
	mov	eax,[esi-8]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi-8]		; V
	adc	edx,0			; U
	sub	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi-8],ebx		; V
ms32_case2:
	mov	eax,[esi-4]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi-4]		; V
	adc	edx,0			; U
	sub	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi-4],ebx		; V
ms32_case1:
	mov	eax,[esi]		; U
	mov	ebx,edx			; V Remember carry for later
	mul	ecx			; NP
	add	eax,ebx			; U Add carry in from previous word
	mov	ebx,[edi]		; V
	adc	edx,0			; U
	sub	ebx,eax			; V
	adc	edx,0			; U
	mov	[edi],ebx		; V

	sub	ebp,4			; U
	ja	SHORT ms32_loop		; V

ms32_done:
	pop	ebx			; U
	pop	ebp			; V
	mov	eax,edx			; U
	pop	edi			; V
	pop	esi			; U
	ret				; NP
_lbnMulSub1_32	endp


;; Two-word by one-word divide.  Stores quotient, returns remainder.
;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
;;                                4            8            12          16
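;;
;; For reference, a rough C sketch (an illustration only, assuming a 64-bit
;; unsigned type; not part of the original code).  As with the div
;; instruction itself, the quotient must fit in 32 bits, i.e. nh < d:
;;
;;	BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
;;	{
;;		unsigned long long n = ((unsigned long long)nh << 32) | nl;
;;		*q = (BNWORD32)(n / d);
;;		return (BNWORD32)(n % d);
;;	}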
align 4
_lbnDiv21_32	proc	near
	mov	edx,[esp+8]		; U Load nh
	mov	eax,[esp+12]		; V Load nl
	mov	ecx,[esp+4]		; U Load q
	div	DWORD PTR [esp+16]	; NP
	mov	[ecx],eax		; U Store quotient
	mov	eax,edx			; V Return remainder
	ret
_lbnDiv21_32	endp

;; Multi-word by one-word remainder.
;; This speeds up key generation.  It's not worth unrolling and so on;
;; using 32-bit divides is enough of a speedup.
;;
;; The modulus (in ecx) is often 16 bits.  Given that the dividend is 32
;; bits, the chances of saving the first divide because the high word of
;; the dividend is less than the modulus are low enough that it's not
;; worth taking the cycles to test for it.
;;
;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
;;                                     4             8           12
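;;
;; For reference, a rough C sketch (an illustration only, assuming a 64-bit
;; unsigned type; not part of the original code).  len must be >= 1:
;;
;;	unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
;;	{
;;		BNWORD32 rem = 0;
;;		while (len--)
;;			rem = (BNWORD32)((((unsigned long long)rem << 32)
;;					  | n[len]) % d);
;;		return (unsigned)rem;
;;	}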
align 4
_lbnModQ_32	proc	near
	mov	eax,[esp+4]		; U Load n
	push	ebp			; V
	mov	ebp,[esp+12]		; U Load len
	push	esi			; V
	lea	esi,[ebp*4+eax-4]	; U
	mov	ecx,[esp+20]		; V Load d
	xor	edx,edx			; U Clear edx for first iteration
modq32_loop:
	mov	eax,[esi]		; U Load new low word for divide
	sub	esi,4			; V
	div	ecx			; NP edx = edx:eax % ecx
	dec	ebp			; U
	jnz	SHORT modq32_loop	; V

	pop	esi			; U
	mov	eax,edx			; V Return remainder in eax
	pop	ebp			; U
	ret				; NP
_lbnModQ_32	endp

	end