2012-03-31 20:22:13 +00:00
|
|
|
* Copyright (c) 1995 Colin Plumb. All rights reserved.
|
|
|
|
* For licensing and other legal details, see the file legal.c.
|
|
|
|
*
|
2010-02-20 18:51:54 +00:00
|
|
|
* lbn68360.c - 32-bit bignum primitives for 683xx processors.
|
|
|
|
*
|
|
|
|
* This code is using InterTools calling convention, which is a bit odd.
|
|
|
|
* One minor note is that the default variable sizes are
|
|
|
|
* char = unsigned 8, short = 8 (in violation of ANSI!),
|
|
|
|
* int = 16, long = 32. Longs (including on the stack) are 16-bit aligned.
|
|
|
|
* Arguments are apdded to 16 bits.
|
|
|
|
* A6 is used as a frame pointer, and globals are indexed off A5.
|
|
|
|
* Return valies are passes id D0 or A0 (or FP0), depending on type.
|
|
|
|
* D0, D1, A0 and A4 (!) are volatile across function calls. A1
|
|
|
|
* must be preserved!
|
|
|
|
*
|
|
|
|
* This code assumes 16-bit ints. Code for 32-bit ints is commented out
|
|
|
|
* with "**".
|
|
|
|
*
|
|
|
|
* Regardless of UINT_MAX, only bignums up to 64K words (2 million bits)
|
|
|
|
* are supported. (68k hackers will recognize this as a consequence of
|
|
|
|
* using dbra.) This could be extended easily if anyone cares.
|
|
|
|
*
|
|
|
|
* These primitives use little-endian word order.
|
|
|
|
* (The order of bytes within words is irrelevant to this issue.)
|
|
|
|
|
|
|
|
* The Metrowerks C compiler (1.2.2) produces bad 68k code for the
|
|
|
|
* following input, which happens to be the inner loop of lbnSub1,
|
|
|
|
* so it has been rewritees in assembly, even though it is not terribly
|
|
|
|
* speed-critical. (Optimizer on or off does not matter.)
|
|
|
|
*
|
|
|
|
* unsigned
|
|
|
|
* decrement(unsigned *num, unsigned len)
|
|
|
|
* {
|
|
|
|
* do {
|
|
|
|
* if ((*num++)-- != 0)
|
|
|
|
* return 0;
|
|
|
|
* } while (--len);
|
|
|
|
* return 1;
|
|
|
|
* }
|
|
|
|
|
|
|
|
* BNWORD32 lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
|
|
|
|
SECTION S_lbnSub1_32,,"code"
|
|
|
|
XDEF _lbnSub1_32
|
|
|
|
_lbnSub1_32:
|
|
|
|
movea.l 4(sp),a0 * num
|
|
|
|
move.l 10(sp),d0 * borrow
|
|
|
|
** move.l 12(sp),d0 * borrow
|
|
|
|
sub.l d0,(a0)+
|
|
|
|
bcc sub_done
|
|
|
|
move.w 8(sp),d0 * len
|
|
|
|
** move.w 10(sp),d0 * len
|
|
|
|
subq.w #2,d0
|
|
|
|
bcs sub_done
|
|
|
|
sub_loop:
|
|
|
|
subq.l #1,(a0)+
|
|
|
|
dbcc d0,sub_loop
|
|
|
|
sub_done:
|
|
|
|
moveq.l #0,d0
|
|
|
|
addx.w d0,d0
|
|
|
|
rts
|
|
|
|
|
|
|
|
* BNWORD32 lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
|
|
|
|
SECTION S_lbnAdd1_32,,"code"
|
|
|
|
XDEF _lbnAdd1_32
|
|
|
|
_lbnAdd1_32:
|
|
|
|
movea.l 4(sp),a0 * num
|
|
|
|
move.l 10(sp),d0 * carry
|
|
|
|
** move.l 12(sp),d0 * carry
|
|
|
|
add.l d0,(a0)+
|
|
|
|
bcc add_done
|
|
|
|
move.w 8(sp),d0 * len
|
|
|
|
** move.w 10(sp),d0 * len
|
|
|
|
subq.w #2,d0
|
|
|
|
bcs add_done
|
|
|
|
add_loop:
|
|
|
|
addq.l #1,(a0)+
|
|
|
|
dbcc d0,add_loop
|
|
|
|
add_done:
|
|
|
|
moveq.l #0,d0
|
|
|
|
addx.w d0,d0
|
|
|
|
rts
|
|
|
|
|
|
|
|
* void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
|
|
|
|
SECTION S_lbnMulN1_32,,"code"
|
|
|
|
XDEF _lbnMulN1_32
|
|
|
|
_lbnMulN1_32:
|
|
|
|
movem.l d2-d5,-(sp) * 16 bytes of extra data
|
|
|
|
moveq.l #0,d4
|
|
|
|
move.l 20(sp),a4 * out
|
|
|
|
move.l 24(sp),a0 * in
|
|
|
|
move.w 28(sp),d5 * len
|
|
|
|
move.l 30(sp),d2 * k
|
|
|
|
** move.w 30(sp),d5 * len
|
|
|
|
** move.l 32(sp),d2 * k
|
|
|
|
|
|
|
|
move.l (a0)+,d3 * First multiply
|
|
|
|
mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
|
|
|
|
move.l d3,(a4)+
|
|
|
|
|
|
|
|
subq.w #1,d5 * Setup for loop unrolling
|
|
|
|
lsr.w #1,d5
|
|
|
|
bcs.s m32_even
|
|
|
|
beq.s m32_short
|
|
|
|
|
|
|
|
subq.w #1,d5 * Set up software pipeline properly
|
|
|
|
move.l d1,d0
|
|
|
|
|
|
|
|
m32_loop:
|
|
|
|
move.l (a0)+,d3
|
|
|
|
mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
|
|
|
|
add.l d0,d3
|
|
|
|
addx.l d4,d1
|
|
|
|
move.l d3,(a4)+
|
|
|
|
m32_even:
|
|
|
|
|
|
|
|
move.l (a0)+,d3
|
|
|
|
mulu.l d2,d0:d3 * dc.w 0x4c02, 0x3400
|
|
|
|
add.l d1,d3
|
|
|
|
addx.l d4,d0
|
|
|
|
move.l d3,(a4)+
|
|
|
|
|
|
|
|
dbra d5,m32_loop
|
|
|
|
|
|
|
|
move.l d0,(a4)
|
|
|
|
movem.l (sp)+,d2-d5
|
|
|
|
rts
|
|
|
|
m32_short:
|
|
|
|
move.l d1,(a4)
|
|
|
|
movem.l (sp)+,d2-d5
|
|
|
|
rts
|
|
|
|
|
|
|
|
* BNWORD32
|
|
|
|
* lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
|
|
|
|
SECTION S_lbnMulAdd1_32,,"code"
|
|
|
|
XDEF _lbnMulAdd1_32
|
|
|
|
_lbnMulAdd1_32:
|
|
|
|
movem.l d2-d5,-(sp) * 16 bytes of extra data
|
|
|
|
moveq.l #0,d4
|
|
|
|
move.l 20(sp),a4 * out
|
|
|
|
move.l 24(sp),a0 * in
|
|
|
|
move.w 28(sp),d5 * len
|
|
|
|
move.l 30(sp),d2 * k
|
|
|
|
** move.w 30(sp),d5 * len
|
|
|
|
** move.l 32(sp),d2 * k
|
|
|
|
|
|
|
|
move.l (a0)+,d3 * First multiply
|
|
|
|
mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
|
|
|
|
add.l d3,(a4)+
|
|
|
|
addx.l d4,d1
|
|
|
|
|
|
|
|
subq.w #1,d5 * Setup for loop unrolling
|
|
|
|
lsr.w #1,d5
|
|
|
|
bcs.s ma32_even
|
|
|
|
beq.s ma32_short
|
|
|
|
|
|
|
|
subq.w #1,d5 * Set up software pipeline properly
|
|
|
|
move.l d1,d0
|
|
|
|
|
|
|
|
ma32_loop:
|
|
|
|
move.l (a0)+,d3
|
|
|
|
mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
|
|
|
|
add.l d0,d3
|
|
|
|
addx.l d4,d1
|
|
|
|
add.l d3,(a4)+
|
|
|
|
addx.l d4,d1
|
|
|
|
ma32_even:
|
|
|
|
|
|
|
|
move.l (a0)+,d3
|
|
|
|
mulu.l d2,d0:d3 * dc.w 0x4c02, 0x3400
|
|
|
|
add.l d1,d3
|
|
|
|
addx.l d4,d0
|
|
|
|
add.l d3,(a4)+
|
|
|
|
addx.l d4,d0
|
|
|
|
|
|
|
|
dbra d5,ma32_loop
|
|
|
|
|
|
|
|
movem.l (sp)+,d2-d5
|
|
|
|
rts
|
|
|
|
ma32_short:
|
|
|
|
move.l d1,d0
|
|
|
|
movem.l (sp)+,d2-d5
|
|
|
|
rts
|
|
|
|
|
|
|
|
* BNWORD32
|
|
|
|
* lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
|
|
|
|
SECTION S_lbnMulSub1_32,,"code"
|
|
|
|
XDEF _lbnMulSub1_32
|
|
|
|
_lbnMulSub1_32:
|
|
|
|
movem.l d2-d5,-(sp) * 16 bytes of extra data
|
|
|
|
moveq.l #0,d4
|
|
|
|
move.l 20(sp),a4 * out
|
|
|
|
move.l 24(sp),a0 * in
|
|
|
|
move.w 28(sp),d5 * len
|
|
|
|
move.l 30(sp),d2 * k
|
|
|
|
** move.w 30(sp),d5 * len
|
|
|
|
** move.l 32(sp),d2 * k
|
|
|
|
|
|
|
|
move.l (a0)+,d3 * First multiply
|
|
|
|
mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
|
|
|
|
sub.l d3,(a4)+
|
|
|
|
addx.l d4,d1
|
|
|
|
|
|
|
|
subq.w #1,d5 * Setup for loop unrolling
|
|
|
|
lsr.w #1,d5
|
|
|
|
bcs.s ms32_even
|
|
|
|
beq.s ms32_short
|
|
|
|
|
|
|
|
subq.w #1,d5 * Set up software pipeline properly
|
|
|
|
move.l d1,d0
|
|
|
|
|
|
|
|
ms32_loop:
|
|
|
|
move.l (a0)+,d3
|
|
|
|
mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
|
|
|
|
add.l d0,d3
|
|
|
|
addx.l d4,d1
|
|
|
|
sub.l d3,(a4)+
|
|
|
|
addx.l d4,d1
|
|
|
|
ms32_even:
|
|
|
|
|
|
|
|
move.l (a0)+,d3
|
|
|
|
mulu.l d2,d0:d3 * dc.w 0x4c02, 0x3400
|
|
|
|
add.l d1,d3
|
|
|
|
addx.l d4,d0
|
|
|
|
sub.l d3,(a4)+
|
|
|
|
addx.l d4,d0
|
|
|
|
|
|
|
|
dbra d5,ms32_loop
|
|
|
|
|
|
|
|
movem.l (sp)+,d2-d5
|
|
|
|
rts
|
|
|
|
|
|
|
|
ms32_short:
|
|
|
|
move.l d1,d0
|
|
|
|
movem.l (sp)+,d2-d5
|
|
|
|
rts
|
|
|
|
|
|
|
|
|
|
|
|
* BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
|
|
|
|
SECTION S_lbnDiv21_32,,"code"
|
|
|
|
XDEF _lbnDiv21_32
|
|
|
|
_lbnDiv21_32:
|
|
|
|
move.l 8(sp),d0
|
|
|
|
move.l 12(sp),d1
|
|
|
|
move.l 4(sp),a0
|
|
|
|
divu.l 16(sp),d0:d1 * dc.w 0x4c6f, 0x1400, 16
|
|
|
|
move.l d1,(a0)
|
|
|
|
rts
|
|
|
|
|
|
|
|
* unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
|
|
|
|
SECTION S_lbnModQ_32,,"code"
|
|
|
|
XDEF _lbnModQ_32
|
|
|
|
_lbnModQ_32:
|
|
|
|
move.l 4(sp),a0 * n
|
|
|
|
move.l d2,-(sp)
|
|
|
|
move.l d3,a4
|
|
|
|
moveq.l #0,d1
|
|
|
|
moveq.l #0,d2
|
|
|
|
move.w 12(sp),d1 * len
|
|
|
|
move.w 14(sp),d2 * d
|
|
|
|
** move.l 12(sp),d1 * len
|
|
|
|
** move.l 16(sp),d2 * d
|
|
|
|
lea -4(a0,d1.L*4),a0 * dc.w 0x41f0, 0x1cfc
|
|
|
|
|
|
|
|
* First time, divide 32/32 - may be faster than 64/32
|
|
|
|
move.l (a0),d3
|
|
|
|
divul.l d2,d0:d3 * dc.w 0x4c02, 0x3000
|
|
|
|
subq.w #2,d1
|
|
|
|
bmi mq32_done
|
|
|
|
|
|
|
|
mq32_loop:
|
|
|
|
move.l -(a0),d3
|
|
|
|
divu.l d2,d0:d3 * dc.w 0x4c02,0x3400
|
|
|
|
dbra d1,mq32_loop
|
|
|
|
|
|
|
|
mq32_done:
|
|
|
|
move.l (sp)+,d2
|
|
|
|
move.l a4,d3
|
|
|
|
rts
|
|
|
|
|
|
|
|
end
|