# Assembly-language bignum primitives for the i960 Jx series.
#
# The Jx series is a fairly straightforward single-instruction-issue
# implementation, with a 1-cycle-issue 4-cycle-latency non-pipelined
# multiplier that we can use.  Note also that loads which hit in the
# cache have 2 cycles of latency and stores stall until all pending
# loads are done.
#
# What is intensely annoying about the i960 is that it uses the same
# flags for all conditional branches (even compare-and-branch sets the
# flags) AND for the carry bit.  Further, it is hard to manipulate
# that bit.
#
# Calling conventions:
# The r registers are all local, if you set them up.  There's an alternative
# calling convention that uses bal (branch and link) and doesn't set them up.
# Currently, all of these functions are designed to work that way.
# NOTE(review): the routines in this file end in "ret" rather than a
# branch through g14 -- confirm which convention callers are expected to use.
# g0-g7 are argument registers and volatile across calls.  Return in g0-g3.
# g8-g11 are extra argument registers, and volatile if used, but
#       preserved if not.  Here, they are not.
# g12 is used for PIC, and is preserved.
# g13 is a pointer to a structure return value, if used, and is volatile.
# g14 is magic, and is used as a return address in the branch-and-link
#       convention, and as a pointer to an argument block if the arguments
#       won't fit in registers, but is usually hardwired 0 and must be
#       returned set to zero (0).
# g15 is the frame pointer, and shouldn't be messed with.
# The AC (condition codes) are all volatile.
# The fp registers are all volatile, but irrelevant.
#
# BNWORD32
# lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
# This adds "k" * "in" to "len" words of "out" and returns the word of
# carry.
#
# For doing multiply-add, the 960 is a bit annoying because it uses
# the same status bits for the carry flag and for the loop indexing
# computation, and doesn't have an "add with carry out but not carry in"
# instruction.
# Fortunately, we can arrange to have the loop indexing
# leave the carry bit clear most of the time.
#
# The basic sequence of the loop is:
# 1. Multiply k * *in++ -> high, low
# 2. Addc carry word and carry bit to low
# 3. Addc carry bit to high, producing carry word (note: cannot generate carry!)
# 4. Addc low to *out++
#
# Note that the carry bit set in step 4 is used in step 2.  The only place
# in this loop that the carry flag isn't in use is between steps 3 and 4,
# so we have to rotate the loop to place the loop indexing operations here.
# (Which consist of a compare-and-decrement and a conditional branch.)
# The loop above ignores the details of when to do loads and stores, which
# have some flexibility, but must be carefully scheduled to avoid stalls.
#
# The first iteration has no carry word in, so it requires only steps 1 and 4,
# and since we begin the loop with step 4, it boils down to just step 1
# followed by the loop indexing (which clears the carry bit in preparation
# for step 4).
#
# The loop body is written out twice (ma_loop and ma_loop1) and the length
# is halved on entry; an even length enters at the second copy, an odd
# length greater than 1 enters at the first, and a length of 1 goes straight
# to the ending code.
# NOTE(review): there is no test for len == 0 -- presumably callers always
# pass len >= 1; confirm.
#
# Arguments are passed as follows:
# g0 - out pointer
# g1 - in pointer
# g2 - length
# g3 - k
# The other registers are used as follows.
# g4 - low word of product
# g5 - high word of product
# g6 - current word of "out"
# g7 - carry word
# g13 - current word of "in"
# The carry word is returned in g0.

        .globl  _lbnMulAdd1_32
_lbnMulAdd1_32:
        ld      (g1),g13        # Fetch *in
        addo    g1,4,g1         # Increment in
        emul    g13,g3,g4       # Do multiply (step 1)
        ld      (g0),g6         # Fetch *out
        chkbit  0,g2            # Check if loop counter was odd
        shro    1,g2,g2         # Divide loop counter by 2
        mov     g5,g7           # Move high word to carry
        bno     ma_loop1        # If even, jump to ma_loop1
        cmpo    0,g2            # If odd, was it 1 (now 0)?
        be      ma_done         # If equal (carry set), jump to ending code

# Entered with carry bit clear
ma_loop:
        ld      (g1),g13        # Fetch *in
        addc    g4,g6,g6        # Add low to *out (step 4), generate carry
        emul    g13,g3,g4       # Do multiply (step 1)
        st      g6,(g0)         # Write out *out
        addo    g0,4,g0         # Increment out
        addo    g1,4,g1         # Increment in
        ld      (g0),g6         # Fetch next *out
        addc    g7,g4,g4        # Add carries to low (step 2)
        addc    g5,0,g7         # Add carry bit to high (step 3) & clear carry
# Second copy of the loop body; also the entry point for even lengths.
ma_loop1:
        ld      (g1),g13        # Fetch *in
        addc    g4,g6,g6        # Add low to *out (step 4), generate carry
        emul    g13,g3,g4       # Do multiply (step 1)
        st      g6,(g0)         # Write out *out
        addo    g0,4,g0         # Increment out
        addo    g1,4,g1         # Increment in
        ld      (g0),g6         # Fetch next *out
        addc    g7,g4,g4        # Add carries to low (step 2)
        addc    g5,0,g7         # Add carry bit to high (step 3) & clear carry
        cmpdeco 1,g2,g2         # Loop indexing: compare with 1, then decrement
        bne     ma_loop         # Not the last pair yet, go around again

# When we come here, carry is *set*, and we still have to do step 4
ma_done:
        cmpi    0,1             # Clear carry (equal flag)
        addc    g4,g6,g6        # Add low to *out (step 4), generate carry
        st      g6,(g0)         # Write out *out
        addc    g7,0,g0         # Add carry bit and word to produce return value
        ret

# Now, multiply N by 1 is similarly annoying.  We only have one add in the
# whole loop, which should just be able to leave its carry output in the
# carry flag for the next iteration, but we need the condition codes to do
# loop testing.  *Sigh*.
#
# void
# lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
# This multiplies len words of "in" by "k" and stores the len+1 word
# result in "out".
#
# To avoid having to do a move after the first iteration, for the first
# step, g4/g5 is the product.  For second step, g6/g7 is used for product
# storage and g5 is the carry in.  It alternates from then on.
# Register usage for _lbnMulN1_32:
# g0 - out pointer
# g1 - in pointer
# g2 - length (halved on entry; the loop body is written out twice)
# g3 - k
# g4/g5 - low/high word of product in the first half of the loop
# g6/g7 - low/high word of product in the second half of the loop
# g13 - current word of "in"
# NOTE(review): there is no test for len == 0 -- presumably callers always
# pass len >= 1; confirm.

        .globl  _lbnMulN1_32
_lbnMulN1_32:
        ld      (g1),g13        # Fetch *in
        addo    g1,4,g1         # Increment in
        emul    g13,g3,g4       # Do multiply (step 1)
        chkbit  0,g2            # Check if loop counter was odd
        shro    1,g2,g2         # Divide loop counter by 2
        bno     m_loop1         # If even, jump to m_loop1
        mov     g4,g6           # Odd: m_loop and m_done expect the result in g6...
        cmpo    0,g2            # If counter was odd, was it 1 (now 0)?
        mov     g5,g7           # ...and the carry word in g7
        be      m_done          # If equal (carry set), jump to ending code

# Entered with carry bit clear
m_loop:                         # Result in g6, carry word in g7
        ld      (g1),g13        # Fetch *in
        addo    g1,4,g1         # Increment in
        emul    g13,g3,g4       # Do multiply (step 1)
        st      g6,(g0)         # Write out *out
        addo    g0,4,g0         # Increment out
        addc    g7,g4,g4        # Add carries to low (step 2)
# No need to add carry bit here, because it'll get remembered until next addc.
#       addc    g5,0,g5         # Add carry bit to high (step 3)
m_loop1:                        # Carry word in g5
        ld      (g1),g13        # Fetch *in
        addo    g1,4,g1         # Increment in
        emul    g13,g3,g6       # Do multiply (step 1)
        st      g4,(g0)         # Write out *out
        addo    g0,4,g0         # Increment out
        addc    g5,g6,g6        # Add carries to low (step 2)
        addc    g7,0,g7         # Add carry bit to high (step 3)
        cmpdeco 1,g2,g2         # Loop indexing: compare with 1, then decrement
        bne     m_loop          # Not the last pair yet, go around again

# When we come here, we have to store g6 and the carry word in g7.
m_done:
        st      g6,(g0)         # Write out *out
        st      g7,4(g0)        # Write out the carry word after it
        ret

# BNWORD32
# lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
# This subtracts "k" * "in" from "len" words of "out" and returns the word of
# borrow.
#
# This is similar to multiply-add, but actually a bit more obnoxious,
# because of the carry situation.  The 960 uses a carry (rather than a borrow)
# bit on subtracts, so the carry bit should be 1 for a subc to do the
# same thing as an ordinary subo.  So we use two carry chains: one from
# the add of the low-order words to the high-order carry word, and a second,
# which uses an extra register, to connect the subtracts.  This avoids
# the need to fiddle with inverting the bit in the usual case.
#
# Arguments are passed as follows:
# g0 - out pointer
# g1 - in pointer
# g2 - length
# g3 - k
# The other registers are used as follows.
# g4 - low word of product
# g5 - high word of product
# g6 - current word of "out"
# g7 - carry word
# g13 - current word of "in"
# g14 - remembered carry bit
# The borrow word is returned in g0, and g14 is set back to 0 before return.
#
# The loop body is written out twice (ms_loop and ms_loop1) and the length
# is halved on entry; an even length enters at the second copy.
# NOTE(review): there is no test for len == 0 -- presumably callers always
# pass len >= 1; confirm.

        .globl  _lbnMulSub1_32
_lbnMulSub1_32:
        ld      (g1),g13        # Fetch *in
        addo    g1,4,g1         # Increment in
        emul    g13,g3,g4       # Do multiply (step 1)
        ld      (g0),g6         # Fetch *out
        chkbit  0,g2            # Check if loop counter was odd
        mov     1,g14           # Set remembered carry for first iteration
        shro    1,g2,g2         # Divide loop counter by 2
        mov     g5,g7           # Move high word to carry
        bno     ms_loop1        # If even, jump to ms_loop1
        cmpo    0,g2            # If odd, was it 1 (now 0)?
        be      ms_done         # If equal (carry set), jump to ending code

# Entered with carry bit clear
ms_loop:
        ld      (g1),g13        # Fetch *in
        cmpi    g14,1           # Set carry flag from remembered carry
        subc    g4,g6,g6        # Subtract low from *out (step 4), gen. carry
        emul    g13,g3,g4       # Do multiply (step 1)
        addc    0,0,g14         # g14 = carry, then clear carry
        st      g6,(g0)         # Write out *out
        addo    g0,4,g0         # Increment out
        addo    g1,4,g1         # Increment in
        ld      (g0),g6         # Fetch next *out
        addc    g7,g4,g4        # Add carries to low (step 2)
        addc    g5,0,g7         # Add carry bit to high (step 3)
# Second copy of the loop body; also the entry point for even lengths.
ms_loop1:
        ld      (g1),g13        # Fetch *in
        cmpi    g14,1           # Set carry flag for subtract
        subc    g4,g6,g6        # Subtract low from *out (step 4), gen. carry
        emul    g13,g3,g4       # Do multiply (step 1)
        addc    0,0,g14         # g14 = carry, then clear carry
        st      g6,(g0)         # Write out *out
        addo    g0,4,g0         # Increment out
        addo    g1,4,g1         # Increment in
        ld      (g0),g6         # Fetch next *out
        addc    g7,g4,g4        # Add carries to low (step 2)
        addc    g5,0,g7         # Add carry bit to high (step 3)
        cmpdeco 1,g2,g2         # Loop indexing: compare with 1, then decrement
        bne     ms_loop         # Not the last pair yet, go around again

# When we come here, we still have to do step 4.  The subtract chain's
# carry is in g14, and the cmpi below reloads the carry flag from it.
ms_done:
        cmpi    g14,1           # set carry (equal flag)
        subc    g4,g6,g6        # Subtract low from *out (step 4), generate carry
        st      g6,(g0)         # Write out *out
        subc    0,0,g14         # g14 = -1 if no carry (borrow), 0 if carry
        subo    g14,g7,g0       # Add borrow bit to produce return value
        mov     0,g14           # Restore g14 to 0 for return
        ret