Difference between revisions of "Z80:Math Routines"

From Learn @ Cemetech
Jump to navigationJump to search
(→‎PseudoRandByte_0: changed into a section of RNGs)
(→‎gcdHL_DE: copied wrong code first time, added 8-bit version)
 
(14 intermediate revisions by the same user not shown)
Line 3: Line 3:
  
 
= Multiplication =
 
= Multiplication =
== DE_Times_A ==
+
== DE_Times_A, 16-bit output ==
 
At 13 bytes, this code is a pretty decent balance of speed and size. It multiplies DE by A and returns a 16-bit result in HL.
 
At 13 bytes, this code is a pretty decent balance of speed and size. It multiplies DE by A and returns a 16-bit result in HL.
  
 
     DE_Times_A:
 
     DE_Times_A:
     ;Inputs:
+
     ;Inputs: DE,A
    ;    DE and A are factors
+
     ;Outputs: HL is product, B=0, A,C,DE preserved
     ;Outputs:
+
     ;342cc~390cc, avg= 366cc
    ;    A is not changed
+
     ;size: 13 bytes
    ;    B is 0
+
        ld b,8
    ;    C is not changed
+
        ld hl,0
    ;    DE is not changed
+
    _:
     ;     HL is the product
+
         add hl,hl \ rlca \ jr nc,$+3 \ add hl,de \ djnz -_
     ;Time:
+
        ret
    ;    342+6x
 
    ;
 
        ld b,8         ;7          7
 
        ld hl,0        ;10        10
 
          add hl,hl     ;11*8      88
 
          rlca         ;4*8        32
 
          jr nc,$+3     ;(12|18)*8  96+6x
 
            add hl,de   ;--        --
 
          djnz $-5      ;13*7+8    99
 
        ret             ;10        10
 
  
 +
== DE_Times_A, 24-bit output ==
 +
This version takes only a minor tweak to return the full 24-bit result.
 +
    DE_Times_A:
 +
    ;Inputs: DE,A
 +
    ;Outputs: A:HL is product, BC=0,DE preserved
 +
    ;343cc~423cc, avg= 383cc
 +
    ;size: 14 bytes
 +
        ld bc,0800h
 +
        ld h,c
 +
        ld l,c
 +
    _:
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
        djnz -_
 +
        ret
  
== DE_Times_A ==_Unrolled
+
== DE_Times_A, Unrolled, 16-bit output ==
 
Unrolled routines are larger in most cases, but they can really save on speed. This is 25% faster at its slowest, 40% faster at its fastest:
 
Unrolled routines are larger in most cases, but they can really save on speed. This is 25% faster at its slowest, 40% faster at its fastest:
  
 +
    DE_Times_A:
 +
    ;Inputs: DE,A
 +
    ;Outputs: A:HL is product, BC,DE preserved
 +
    ;min: 203cc
 +
    ;max: 268cc
 +
    ;avg: 235cc
 +
    ;size: 43 bytes
 +
 +
        ld hl,0  \ rlca \ jr nc,$+5 \ ld h,d \ ld e,l
 +
        add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
 +
        add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
 +
        add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
 +
        add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
 +
        add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
 +
        add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
 +
        add hl,hl \ rlca \ ret nc \ add hl,de \ ret
  
    ;===============================================================
+
== DE_Times_A, unrolled, 24-bit output ==
 +
This version takes only a minor tweak to return the full 24-bit result.
 +
This is roughly 34% faster, being unrolled.
 
     DE_Times_A:
 
     DE_Times_A:
    ;===============================================================
+
     ;Inputs: DE,A
     ;Inputs:
+
     ;Outputs: A:HL is product, C=0, B,DE preserved
    ;    DE and A are factors
+
     ;207cc~300cc, avg= 253.5cc
     ;Outputs:
+
     ;size: 14 bytes
    ;    A is unchanged
+
        ld hl,0
    ;    BC is unchanged
+
        ld c,h
    ;    DE is unchanged
+
        ;or a     Uncomment to allow early exit if A=0
    ;    HL is the product
+
        ;ret z
     ;speed: min 199 cycles
+
                add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
    ;      max 261 cycles
+
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    ;        212+6b cycles +15 if odd, -11 if non-negative
+
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    ;=====================================Cycles====================
+
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
     ;1
+
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
        ld hl,0           ;210000          10     10
+
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
        rlca                  ;07            4
+
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
        jr nc,$+5 \ ld h,d \ ld e,l  ;3002626B  12+14p
+
        add hl,hl \ rla \ ret nc \ add hl,de \ adc a,c
    ;2
+
        ret
        add hl,hl             ;29            --
 
        rlca                  ;07            4
 
        jr nc,$+3 \ add hl,de ;300119    12+6b
 
    ;3
 
        add hl,hl             ;29            11
 
        rlca                  ;07            4
 
        jr nc,$+3 \ add hl,de ;300119    12+6b
 
    ;4
 
        add hl,hl             ;29            11
 
        rlca                  ;07            4
 
        jr nc,$+3 \ add hl,de ;300119    12+6b
 
    ;5
 
        add hl,hl             ;29            11
 
        rlca                  ;07            4
 
        jr nc,$+3 \ add hl,de ;300119    12+6b
 
    ;6
 
        add hl,hl             ;29            11
 
        rlca                  ;07            4
 
        jr nc,$+3 \ add hl,de ;300119    12+6b
 
    ;7
 
        add hl,hl             ;29            11
 
        rlca                  ;07            4
 
        jr nc,$+3 \ add hl,de ;300119    12+6b
 
    ;8
 
        add hl,hl             ;29            11
 
        rlca                  ;07            4
 
        ret nc                 ;D0        11-6b
 
        add hl,de             ;300119    12+6b
 
        ret                   ;C9            10
 
  
 +
== B_Times_DE, 16-bit output ==
 +
This routine removes leading zeroes before finishing the multiplication, which take a little more code, but results in a faster average speed.
  
 +
    B_Times_DE:
 +
    ;Inputs: A,DE
 +
    ;Outputs: HL=product, B=0, A=input B, C,DE unafected
 +
    ;22 bytes
 +
    ;B=0:    26cc
 +
    ;B=1:    201cc
 +
    ;B>1:    219cc~339cc
 +
    ;avg=319.552734375cc    (319+283/512)
  
== A_Times_DE ==
+
        ld hl,0
 +
        or b
 +
        ret z
 +
        ld b,8
 +
        rlca
 +
        dec b
 +
        jr nc,$-2
 +
        ld h,d
 +
        ld l,e
 +
        ret z
 +
    _:
 +
        add hl,hl
 +
        rlca
 +
        jr nc,$+3
 +
        add hl,de
 +
        djnz -_
 +
        ret
 +
 
 +
== B_Times_DE, 24-bit output ==
 +
This routine removes leading zeroes before finishing the multiplication, which take a little more code, but
 
This routine uses another clever way of optimizing for speed without unrolling. The result is slightly larger and a bit faster. The idea here is to remove leading zeros before multiplying.
 
This routine uses another clever way of optimizing for speed without unrolling. The result is slightly larger and a bit faster. The idea here is to remove leading zeros before multiplying.
  
  
     A_Times_DE:
+
     B_Times_DE:
     ;211 for times 1
+
     ;Inputs: A,DE
    ;331 tops
+
     ;Outputs: HL=product, B=0, A=input B, C,DE unafected
     ;Outputs:
 
    ;    HL is the product
 
    ;    B is 0
 
    ;    A,C,DE are not changed
 
    ;    z flag set
 
    ;
 
        ld hl,0
 
        or a
 
        ld b,h    ;remove this if you don't need b=0 for output. Saves 4 cycles, 1 byte
 
        ret z
 
        ld b,9
 
          rlca
 
          dec b
 
          jr nc,$-2
 
    Loop1:
 
        add hl,de
 
    Loop2:
 
        dec b
 
        ret z
 
        add hl,hl
 
        rlca
 
        jp c,Loop1      ;21|20
 
        jp Loop2
 
 
     ;22 bytes
 
     ;22 bytes
 +
        ld hl,0
 +
        or b
 +
        ret z
 +
        ld bc,800h
 +
        rla
 +
        dec b
 +
        jr nc,$-2
 +
        ld h,d
 +
        ld l,e
 +
        ret z
 +
    _:
 +
        add hl,hl
 +
        rla
 +
        jr nc,$+3
 +
        add hl,de
 +
        adc a,c
 +
        djnz -_
 +
        ret
  
 
+
== DE_Times_BC, 32-bit result==
== DE_Times_BC ==
 
  
 
     DE_Times_BC:
 
     DE_Times_BC:
Line 124: Line 140:
 
     ;    A is 0
 
     ;    A is 0
 
     ;    BC is not changed
 
     ;    BC is not changed
     ;    DEHL is the product
+
     ;    DE:HL is the product
     ;
+
     ;902cc~1206cc, avg=1050cc
 +
    ;20 bytes
 
           ld hl,0
 
           ld hl,0
 
           ld a,16
 
           ld a,16
Line 138: Line 155:
 
             jr nz,Mul_Loop_1
 
             jr nz,Mul_Loop_1
 
           ret
 
           ret
 +
 +
== DE_Times_BC, pseudo-unrolled result==
 +
For 5 bytes more, you can cut out 93cc from the average. This method somehat unrolls the code by making the iterated code into a subroutine, then place a call to that subroutine that falls through to the same subroutine, essentially iterating it twice. Now do the same to this subroutine, and you have iterated 4 times, do this twice more to get the 16 iterations we need.
 +
    DE_Times_BC:
 +
    ;Inputs:
 +
    ;    DE and BC are factors
 +
    ;Outputs:
 +
    ;    A,BC are not changed
 +
    ;    DE:HL is the product
 +
    ;25 bytes
 +
    ;873cc~1289cc, avg=957cc
 +
        ld hl,0
 +
        call +_
 +
    _:
 +
        call +_
 +
    _:
 +
        call +_
 +
    _:
 +
        call +_
 +
    _:
 +
    ;38cc or 54cc or 64cc, avg=43.25
 +
        add hl,hl
 +
        rl e \ rl d
 +
        ret nc
 +
        add hl,bc
 +
        ret nc
 +
        inc de
 +
        ret
 +
 +
== BC_Times_DE, fully unrolled ==
 +
This is very fast, averaging less than 600cc. 38% and 43% faster than the pseudo-unrolled and regular routine, but 123 bytes.
 +
    BC_Times_DE:
 +
    ;BC*DE->BCHL
 +
    ;out: E=0, A,D are destroyed
 +
    ;Assuming B==0,C==0    128cc
 +
    ;Assuming B==0,C!=0    317cc~414cc, avg 365.5c
 +
    ;        B!=0,C==0    317cc~422cc, avg 367.5c
 +
    ;Assuming B!=0,C!=0    527cc~695cc, avg 598.5
 +
    ;Overall average: 78209011/131072=596.68740081787109375
 +
    ;123 bytes
 +
        ld a,b
 +
        ld hl,0
 +
        ld b,h
 +
        or a
 +
        jr z,+_
 +
                add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
 +
    _:
 +
        push hl
 +
        ld h,b
 +
        ld l,b
 +
        ld b,a
 +
        ld a,c
 +
        ld c,b
 +
        or a
 +
        jr z,+_
 +
                add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
        add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
 +
    _:
 +
        pop de
 +
        ld c,d
 +
        ld d,e
 +
        ld e,0
 +
        add hl,de
 +
        ret nc
 +
        adc a,c
 +
        ld c,a
 +
        ret nc
 +
        inc b
 +
        ret
 +
source: [http://pastebin.com/1HZq5vnn Zeda's Pastebin/BC_Times_DE]
  
 
== C_Times_D ==
 
== C_Times_D ==
Line 158: Line 258:
  
  
== D_Times_C ==
+
== mul32, 64-bit output ==
This routine returns a 16-bit value with C as the overflow.
+
With these sizes, we need to use RAM to hold intermediate values.
 +
'''mul16 needs to perform DE*BC => DEHL'''
 +
    mul32:
 +
    ;;uses karatsuba multiplication
 +
    ;;var_x * var_y
 +
    ;;z0 holds the 64-bit result
 +
    ;;708cc+6a+13b+42c +3mul
 +
    ;;Avg: 2464.110153
 +
    ;;Max:2839cc, 92cc faster
 +
    ;;Min:2178cc (early can make it faster, though), 167cc faster
 +
        ld de,(var_x)  ;\
 +
        ld bc,(var_y)  ; |compute z0,z2
 +
        push bc        ; | var_y
 +
        call mul16      ; |
 +
        ld (var_z0),hl  ; |
 +
        ld bc,(var_y+2) ; |
 +
        ld (var_z0+2),de; |
 +
        ld de,(var_x+2) ; |
 +
        push bc        ; | var_y+2
 +
        call mul16      ; |
 +
        ld (var_z2),hl  ; |
 +
        ld (var_z2+2),de;/      208cc
 +
        xor a           ;\
 +
        ld hl,(var_x)  ; |
 +
        ld de,(var_x+2) ; |
 +
        add hl,de      ; |
 +
        rra            ; |
 +
        pop de          ; |
 +
        ex (sp),hl      ; |
 +
        add hl,de      ; |
 +
        pop bc          ; |
 +
        ex de,hl        ; |    109cc
 +
        push de        ; |if bit0=1, add DE<<16 to result
 +
        push bc        ; |
 +
        push af        ; |c flag means add BC<<16 to result
 +
        call mul16      ; |
 +
        ex de,hl        ; |
 +
        pop af          ; |
 +
        pop bc          ; |
 +
        jr nc,$+3      ; |    86+6a
 +
        add hl,bc      ; |
 +
        pop bc          ; |
 +
        rla            ; |
 +
        jr nc,$+4      ; |    26+13b
 +
        add hl,bc      ; |
 +
        adc a,0        ; |z1 = AHLDE-z2-z1
 +
        ex de,hl \ ld bc,(var_z0) \ sbc hl,bc
 +
        ex de,hl \ ld bc,(var_z0+2) \ sbc hl,bc
 +
        sbc a,0
 +
        ex de,hl \ ld bc,(var_z2) \ sbc hl,bc
 +
        ex de,hl \ ld bc,(var_z2+2) \ sbc hl,bc
 +
        sbc a,0        ; |z1 = AHLDE
 +
        ld b,h \ ld c,l ;/ z1 = ABCDE
 +
        ld hl,(var_z0+2);\
 +
        add hl,de      ; |Add:
 +
        ld (var_z0+2),hl; |z2z0
 +
        ld hl,(var_z2)  ; | z1
 +
        adc hl,bc      ; |----
 +
        ld (var_z2),hl  ; |
 +
        ret nc          ; |    279+42c
 +
        ld hl,(var_z2+2); |
 +
        inc hl          ; |
 +
        ld (var_z2+2),hl; |
 +
        ret            ;/
 +
source [http://pastebin.com/PT3qqGFu Zeda's Pastebin/mul32]
  
 
+
== BCDE_Times_A ==
    ;Returns a 16-bit result
+
     BCDE_Times_A:
    ;
+
     ;Inputs: BC:DE,A
    ;===============================================================
+
     ;Outputs: A:HL:IX is the 40-bit product, BC,DE unaffected
     D_Times_C:
+
     ;503cc~831cc
    ;===============================================================
+
     ;667cc average
     ;Inputs:
+
     ;29 bytes
    ;    D and C are factors
+
         ld ix,0
     ;Outputs:
+
        ld hl,0
    ;    A is the product (lower 8 bits)
+
        call +_
     ;     B is 0
+
     _:
     ;     C is the overflow (upper 8 bits)
+
         call +_
    ;    DE, HL are not changed
+
     _:
     ;Size:  15 bytes
+
         call +_
    ;Speed: 312+12z-y
+
     _:
    ;    See Speed Summary below
+
        add ix,ix \ adc hl,hl \ rla \ ret nc
    ;===============================================================
+
        add ix,de \ adc hl,bc \ adc a,0
        xor a         ;This is an optimised way to set A to zero. 4 cycles, 1 byte.
+
        ret
        ld b,8        ;Number of bits in E, so number of times we will cycle through
+
source: [http://pastebin.com/yzFsHMqn Zeda's Pastebin/BCDE_Times_A]
    Loop:
 
        add a,a      ;We double A, so we shift it left. Overflow goes into the c flag.
 
        rl c          ;Rotate overflow in and get the next bit of C in the c flag
 
        jr nc,$+6    ;If it is 0, we don't need to add anything to A
 
          add a,d    ;Since it was 1, we do A+1*D
 
          jr nc,$+3  ;Check if there was overflow
 
            inc c    ;If there was overflow, we need to increment E
 
        djnz Loop     ;Decrements B, if it isn't zero yet, jump back to Loop:
 
        ret
 
   
 
   
 
   
 
    ;Speed Summary
 
    ;    xor a         ;  4
 
     ;    ld b,8        ;  7
 
    ;Loop:             ;
 
    ;    add a,a      ; 32
 
    ;    rl c          ; 64
 
    ;    jr nc,$+6    ; 96+12z-y  z=number of bits in C, y is overflow, so at most one less than z
 
    ;      add a,d    ; --
 
    ;      jr nc,$+3  ; --
 
    ;         inc c    ; --
 
    ;    djnz Loop    ; 99
 
    ;    ret          ; 10
 
    ;
 
    ;312+12z-y
 
    ;    z is the number of bits in C
 
    ;    y is the number of overflows in the branch. This is at most z-1.
 
     ;Max: 415 cycles
 
 
 
 
 
== DEHL_Mul_IXBC ==
 
32-bit multiplication
 
 
 
    ;***********
 
    ;**  RAM  **
 
    ;***********
 
    ;This uses Self Modifying Code to get a speed boost. This must be
 
    ;used from RAM.
 
    DEHL_Mul_IXBC:
 
    ;Inputs:
 
    ;    DEHL
 
    ;    IXBC
 
    ;Outputs:
 
    ;    AF is the return address
 
    ;    IXHLDEBC is the result
 
    ;    4 bytes at TempWord1 contain the upper 32-bits of the result
 
    ;    4 bytes at TempWord3 contain the value of the input stack values
 
    ;Comparison/Perspective:
 
    ;    At 6MHz, this can be executed at the slowest more than 726
 
    ;    times per second.
 
    ;    At 15MHz, this can be executed at the slowest more than
 
    ;    1815 times per second.
 
    ;===============================================================
 
        ld (TempWord1),hl
 
        ld (TempWord2),de
 
        ld (TempWord3),bc
 
        ld (TempWord4),ix
 
        ld a,32
 
        ld bc,0
 
        ld d,b \ ld e,b
 
    Mult32StackLoop:
 
        sla c \ rl b \ rl e \ rl d
 
        .db 21h                    ;ld hl,**
 
    TempWord1:
 
        .dw 0
 
        adc hl,hl
 
        .db 21h                    ;ld hl,**
 
    TempWord2:
 
        .dw 0
 
        adc hl,hl
 
        jr nc,OverFlowDone
 
          .db 21h
 
    TempWord3:
 
          .dw 0
 
          add hl,bc
 
          ld b,h \ ld c,l
 
          .db 21h
 
    TempWord4:
 
          .dw 0
 
          adc hl,de
 
          ex de,hl
 
          jr nc,OverFlowDone
 
            ld hl,TempWord1
 
            inc (hl) \ jr nz,OverFlowDone
 
            inc hl \ inc (hl) \ jr nz,OverFlowDone
 
            inc hl \ inc (hl) \ jr nz,OverFlowDone
 
            inc hl \ inc (hl)
 
    OverFlowDone:
 
        dec a
 
        jr nz,Mult32StackLoop
 
        ld ix,(TempWord2)
 
        ld hl,(TempWord1)
 
        ret
 
 
 
 
 
== DEHL_Times_A ==
 
 
 
    ;===============================================================
 
    DEHL_Times_A:
 
    ;===============================================================
 
    ;Inputs:
 
    ;    DEHL is a 32 bit factor
 
    ;    A is an 8 bit factor
 
    ;Outputs:
 
    ;    interrupts disabled
 
    ;    BC is not changed
 
    ;    AHLDE is the 40-bit result
 
    ;    D'E' is the lower 16 bits of the input
 
    ;    H'L' is the lower 16 bits of the output
 
    ;    B' is 0
 
    ;    C' is not changed
 
    ;    A' is not changed
 
    ;===============================================================
 
        di
 
        push hl
 
        or a
 
        sbc hl,hl
 
        exx
 
        pop de
 
        sbc hl,hl
 
        ld b,8
 
    mul32Loop:
 
          add hl,hl
 
          rl e \ rl d
 
          add a,a
 
          jr nc,$+8
 
            add hl,de
 
            exx
 
            adc hl,de
 
            inc a
 
            exx
 
          djnz mul32Loop
 
          push hl
 
          exx
 
          pop de
 
          ret
 
  
 
== H_Times_E ==
 
== H_Times_E ==
Line 438: Line 465:
 
Here are a handful of optimised routines for the absolute value of a number:
 
Here are a handful of optimised routines for the absolute value of a number:
  
 
+
== absHL ==
 
     absHL:
 
     absHL:
 
         bit 7,h
 
         bit 7,h
Line 445: Line 472:
 
         sbc a,a \ sub h \ ld h,a
 
         sbc a,a \ sub h \ ld h,a
 
         ret
 
         ret
 +
 +
== absDE ==
 
     absDE:
 
     absDE:
 
         bit 7,d
 
         bit 7,d
Line 451: Line 480:
 
         sbc a,a \ sub d \ ld d,a
 
         sbc a,a \ sub d \ ld d,a
 
         ret
 
         ret
 +
== absBC ==
 
     absBC:
 
     absBC:
 
         bit 7,b
 
         bit 7,b
Line 457: Line 487:
 
         sbc a,a \ sub b \ ld b,a
 
         sbc a,a \ sub b \ ld b,a
 
         ret
 
         ret
 +
== absA ==
 
     absA:
 
     absA:
 
         or a
 
         or a
 
         ret p
 
         ret p
         neg         ;or you can use      cpl \ inc a
+
         neg
 
         ret
 
         ret
 
+
== abs[reg8] ==
 +
    abs[reg8]:
 +
        xor a
 +
        sub [reg8]
 +
        ret m
 +
        ld [reg8],a
 +
        ret
  
 
= Division =
 
= Division =
Line 491: Line 528:
 
         ret
 
         ret
  
== DE_Div_BC ==
+
== BC_Div_DE ==
This divides DE by BC, storing the result in DE, remainder in HL
+
This divides BC by DE, storing the result in AC, remainder in HL
 
 
    DE_Div_BC:          ;1281-2x, x is at most 16
 
        ld a,16        ;7
 
        ld hl,0        ;10
 
        jp $+5        ;10
 
    DivLoop:
 
          add hl,bc    ;--
 
          dec a        ;64
 
          ret z        ;86
 
   
 
          sla e        ;128
 
          rl d        ;128
 
          adc hl,hl    ;240
 
          sbc hl,bc    ;240
 
          jr nc,DivLoop ;23|21
 
          inc e        ;--
 
          jp DivLoop+1
 
  
 +
    BC_Div_DE:
 +
    ;Inputs: BC,DE
 +
    ;Outputs: DE unaffected, HL is remainder, AC is quotient, B=0
 +
    ;20 bytes
 +
    ;1098cc~1258cc, avg=1178cc
 +
        ld hl,0
 +
        ld a,b
 +
        ld b,16
 +
    _:
 +
        sll c \ rla \ adc hl,hl \ sbc hl,de \ jr nc,$+4 \ add hl,de \ dec c
 +
        djnz -_
 +
        ret
  
 
== DEHL_Div_C ==
 
== DEHL_Div_C ==
Line 1,096: Line 1,128:
  
  
= GCDHL_BC =
+
= gcdHL_DE =
This computes the Greatest Common Divisor of HL and BC:
+
This computes the Greatest Common Divisor of HL and DE, using the binary gcd algorithm.
  
     GCDHL_BC:
+
     gcdHL_DE:
     ;Inputs:
+
     ;gcd(HL,DE)->HL
     ;     HL,BC
+
     ;binary GCD algorithm
    ;Outputs:
+
        ld a,h \ or l \ ret z
    ;    A is 0
+
        ex de,hl
    ;    BC,DE are both the GCD
+
        ld a,h \ or l \ ret z
    ;    HL is 0
+
        sbc hl,de
        ld a,16
+
        add hl,de
        ld de,0
+
        ret z
        add hl,hl
+
        ld b,1
        ex de,hl
+
        ld a,e \ or l \ rra \ jr c,+_
        adc hl,hl
+
        inc b
        or a
+
        rr h \ rr l
        sbc hl,bc
+
        rr d \ rr e
        jr c,$+3
+
        ld a,e \ or l \ rra \ jr nc,$-12
        add hl,bc
+
    _:
        ex de,hl
+
        srl h \ rr l \ jr nc,$-4 \ adc hl,hl
        dec a
+
        ex de,hl
        jr nz,GCDHL_BC+5
+
    _:
        ld h,b
+
        srl h \ rr l \ jr nc,$-4 \ adc hl,hl
        ld l,c
+
        xor a \ sbc hl,de
        ld b,d
+
        jr z,+_
        ld c,e
+
        jr nc,-_ \ sub l \ ld l,a \ sbc a,a \ sub h \ ld h,a
        ld a,d \ or e
+
        jp -_-1
        jr nz,GCDHL_BC
+
    _:
        ret
+
        ex de,hl
 +
        dec b
 +
        ret z
 +
        add hl,hl
 +
        djnz $-1
 +
        ret
  
 
= Modulus =
 
= Modulus =
 +
== DEHL_mod_3 ==
 +
See below.
 +
== HLDE_mod_3 ==
 +
See below.
 +
== HL_mod_3 ==
 +
See below.
 
== A_mod_3 ==
 
== A_mod_3 ==
 
Computes A mod 3 (essentially, the remainder of A after division by 3):
 
Computes A mod 3 (essentially, the remainder of A after division by 3):
 +
 +
    DEHL_mod_3:
 +
    HLDE_mod_3:
 +
    ;Inputs:
 +
    ;  HL:DE  unsigned integer
 +
    ;Outputs:
 +
    ;  A = HLDE mod 3
 +
    ;  Z flag is set if divisible by 3
 +
    ;Destroys:
 +
    ;  C
 +
    ; 27 bytes, 103cc,118cc,104cc,119cc, avg=114.75cc
 +
        add hl,de
 +
        jr nc,$+3
 +
        dec hl
 +
    HL_mod_3:
 +
    ;Inputs:
 +
    ;  HL  unsigned integer
 +
    ;Outputs:
 +
    ;  A = HL mod 3
 +
    ;  Z flag is set if divisible by 3
 +
    ;Destroys:
 +
    ;  C
 +
    ; 23 bytes, 80cc or 95cc, avg 91.25cc
 +
        ld a,h
 +
        add a,l
 +
        sbc a,0  ;conditional decrement
 +
    A_mod_3:
 
     ;Inputs:  
 
     ;Inputs:  
 
     ;  A  unsigned integer  
 
     ;  A  unsigned integer  
Line 1,136: Line 1,206:
 
     ;Destroys:  
 
     ;Destroys:  
 
     ;  C  
 
     ;  C  
     ; 19 bytes,  ~75cc (65 or 80)
+
     ; 19 bytes,  65cc or 80cc, avg=76.25cc
    bMod3:
 
 
         ld c,a                    ;add nibbles   
 
         ld c,a                    ;add nibbles   
 
         rrca / rrca / rrca / rrca   
 
         rrca / rrca / rrca / rrca   
Line 1,151: Line 1,220:
 
         ret
 
         ret
 
source: [https://www.cemetech.net/forum/viewtopic.php?p=249602#249602 Cemetech/Fast 8-bit mod 3]
 
source: [https://www.cemetech.net/forum/viewtopic.php?p=249602#249602 Cemetech/Fast 8-bit mod 3]
 
== HL_mod_3 ==
 
 
    HL_mod_3:
 
    ;Outputs:
 
    ;    Preserves HL
 
    ;    A is the remainder
 
    ;    destroys DE,BC
 
    ;    z flag if divisible by 3, else nz
 
        ld bc,030Fh
 
        ld a,h
 
        add a,l
 
        sbc a,0  ;conditional decrement
 
    ;Now we need to add the upper and lower nibble in a
 
        ld d,a
 
        and c
 
        ld e,a
 
        ld a,d
 
        rlca
 
        rlca
 
        rlca
 
        rlca
 
        and c
 
   
 
        add a,e
 
        sub c
 
        jr nc,$+3
 
        add a,c
 
    ;add the lower half nibbles
 
   
 
        ld d,a
 
        sra d
 
        sra d
 
        and b
 
        add a,d
 
        sub b
 
        ret nc
 
        add a,b
 
        ret
 
    ;at most 132 cycles, at least 123
 
 
== DEHL_mod_3 ==
 
Same as HLDE_mod_3
 
== HLDE_mod_3 ==
 
 
    DEHL_mod_3:
 
    HLDE_mod_3:
 
    ;Outputs:
 
    ;    A is the remainder
 
    ;    destroys DE,BC
 
    ;    z flag if divisible by 3, else nz
 
        ld bc,030Fh
 
        add hl,de
 
        jr nc,$+3
 
        dec hl
 
        ld a,h
 
        add a,l
 
        sbc a,0  ;conditional decrement
 
    ;Now we need to add the upper and lower nibble in a
 
        ld d,a
 
        and c
 
        ld e,a
 
        ld a,d
 
        rlca
 
        rlca
 
        rlca
 
        rlca
 
        and c
 
   
 
        add a,e
 
        sub c
 
        jr nc,$+3
 
        add a,c
 
    ;add the lower half nibbles
 
   
 
        ld d,a
 
        sra d
 
        sra d
 
        and b
 
        add a,d
 
        sub b
 
        ret nc
 
        add a,b
 
        ret
 
    ;at most 156 cycles, at least 146
 
  
  
Line 1,249: Line 1,233:
 
         djnz Loop
 
         djnz Loop
 
         ret
 
         ret
 
 
  
 
= Random Numbers =
 
= Random Numbers =
Line 1,402: Line 1,384:
 
         ret
 
         ret
 
source: [http://pastebin.com/5UWFemuu Zeda's Pastebin/rand]
 
source: [http://pastebin.com/5UWFemuu Zeda's Pastebin/rand]
 
= PseudoRandWord_0: =
 
Similar to the PseudoRandByte_0, this generates a a sequence of pseudo-random values that has a cycle of 65536 (so it will hit every single number):
 
 
    PseudoRandWord:
 
    ;f(n+1)=241f(n)+257  ;65536
 
    ;181 cycles, add 17 if called
 
    ;Outputs:
 
    ;    BC was the previous pseudorandom value
 
    ;    HL is the next pseudorandom value
 
    ;Notes:
 
    ;    You can also use B,C,H,L as pseudorandom 8-bit values
 
    ;    this will generate all 8-bit values
 
        .db 21h    ;start of ld hl,**
 
    randSeed:
 
        .dw 0
 
        ld c,l
 
        ld b,h
 
        add hl,hl
 
        add hl,bc
 
        add hl,hl
 
        add hl,bc
 
        add hl,hl
 
        add hl,bc
 
        add hl,hl
 
        add hl,hl
 
        add hl,hl
 
        add hl,hl
 
        add hl,bc
 
        inc h
 
        inc hl
 
        ld (randSeed),hl
 
        ret
 
  
 
= Fixed Point Math =
 
= Fixed Point Math =

Latest revision as of 22:30, 31 May 2016

If you want to make games, you will likely need math routines, if you are making a math program, you need math, and if you are making a utility, you will need math. You will need math in a good number of programs, so here are some routines that might prove useful.


Multiplication

DE_Times_A, 16-bit output

At 13 bytes, this code is a pretty decent balance of speed and size. It multiplies DE by A and returns a 16-bit result in HL.

   DE_Times_A:
   ;Inputs: DE,A
   ;Outputs: HL is product, B=0, A,C,DE preserved
   ;342cc~390cc, avg= 366cc
   ;size: 13 bytes
       ld b,8
       ld hl,0
   _:
       add hl,hl \ rlca \ jr nc,$+3 \ add hl,de \ djnz -_
       ret

DE_Times_A, 24-bit output

This version takes only a minor tweak to return the full 24-bit result.

   DE_Times_A:
   ;Inputs: DE,A
   ;Outputs: A:HL is product, BC=0,DE preserved
   ;343cc~423cc, avg= 383cc
   ;size: 14 bytes
       ld bc,0800h
       ld h,c
       ld l,c
   _:
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       djnz -_
       ret

DE_Times_A, Unrolled, 16-bit output

Unrolled routines are larger in most cases, but they can really save on speed. This is 25% faster at its slowest, 40% faster at its fastest:

   DE_Times_A:
   ;Inputs: DE,A
   ;Outputs: A:HL is product, BC,DE preserved
   ;min: 203cc
   ;max: 268cc
   ;avg: 235cc
   ;size: 43 bytes
       ld hl,0   \ rlca \ jr nc,$+5 \ ld h,d \ ld e,l
       add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
       add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
       add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
       add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
       add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
       add hl,hl \ rlca \ jr nc,$+3 \ add hl,de
       add hl,hl \ rlca \ ret nc \ add hl,de \ ret

DE_Times_A, unrolled, 24-bit output

This version takes only a minor tweak to return the full 24-bit result. This is roughly 34% faster, being unrolled.

   DE_Times_A:
   ;Inputs: DE,A
   ;Outputs: A:HL is product, C=0, B,DE preserved
   ;207cc~300cc, avg= 253.5cc
   ;size: 14 bytes
       ld hl,0
       ld c,h
       ;or a      Uncomment to allow early exit if A=0
       ;ret z
               add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ ret nc \ add hl,de \ adc a,c
       ret

B_Times_DE, 16-bit output

This routine removes leading zeroes before finishing the multiplication, which take a little more code, but results in a faster average speed.

   B_Times_DE:
   ;Inputs: A,DE
   ;Outputs: HL=product, B=0, A=input B, C,DE unafected
   ;22 bytes
   ;B=0:    26cc
   ;B=1:    201cc
   ;B>1:    219cc~339cc
   ;avg=319.552734375cc    (319+283/512)
       ld hl,0
       or b
       ret z
       ld b,8
       rlca
       dec b
       jr nc,$-2
       ld h,d
       ld l,e
       ret z
   _:
       add hl,hl
       rlca
       jr nc,$+3
       add hl,de
       djnz -_
       ret

B_Times_DE, 24-bit output

This routine removes leading zeroes before finishing the multiplication, which take a little more code, but This routine uses another clever way of optimizing for speed without unrolling. The result is slightly larger and a bit faster. The idea here is to remove leading zeros before multiplying.


   B_Times_DE:
   ;Inputs: A,DE
   ;Outputs: HL=product, B=0, A=input B, C,DE unafected
   ;22 bytes
       ld hl,0
       or b
       ret z
       ld bc,800h
       rla
       dec b
       jr nc,$-2
       ld h,d
       ld l,e
       ret z
   _:
       add hl,hl
       rla
       jr nc,$+3
       add hl,de
       adc a,c
       djnz -_
       ret

DE_Times_BC, 32-bit result

   DE_Times_BC:
   ;Inputs:
   ;     DE and BC are factors
   ;Outputs:
   ;     A is 0
   ;     BC is not changed
   ;     DE:HL is the product
   ;902cc~1206cc, avg=1050cc
   ;20 bytes
          ld hl,0
          ld a,16
   Mul_Loop_1:
            add hl,hl
            rl e \ rl d
            jr nc,$+6
              add hl,bc
              jr nc,$+3
              inc de
            dec a
            jr nz,Mul_Loop_1
          ret

DE_Times_BC, pseudo-unrolled result

For 5 bytes more, you can cut out 93cc from the average. This method somehat unrolls the code by making the iterated code into a subroutine, then place a call to that subroutine that falls through to the same subroutine, essentially iterating it twice. Now do the same to this subroutine, and you have iterated 4 times, do this twice more to get the 16 iterations we need.

   DE_Times_BC:
   ;Inputs:
   ;     DE and BC are factors
   ;Outputs:
   ;     A,BC are not changed
   ;     DE:HL is the product
   ;25 bytes
   ;873cc~1289cc, avg=957cc
       ld hl,0
       call +_
   _:
       call +_
   _:
       call +_
   _:
       call +_
   _:
   ;38cc or 54cc or 64cc, avg=43.25
       add hl,hl
       rl e \ rl d
       ret nc
       add hl,bc
       ret nc
       inc de
       ret

BC_Times_DE, fully unrolled

This is very fast, averaging less than 600cc. 38% and 43% faster than the pseudo-unrolled and regular routine, but 123 bytes.

   BC_Times_DE:
   ;BC*DE->BCHL
   ;out: E=0, A,D are destroyed
   ;Assuming B==0,C==0     128cc
   ;Assuming B==0,C!=0     317cc~414cc, avg 365.5c
   ;         B!=0,C==0     317cc~422cc, avg 367.5c
   ;Assuming B!=0,C!=0     527cc~695cc, avg 598.5
   ;Overall average: 78209011/131072=596.68740081787109375
   ;123 bytes
       ld a,b
       ld hl,0
       ld b,h
       or a
       jr z,+_
               add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
   _:
       push hl
       ld h,b
       ld l,b
       ld b,a
       ld a,c
       ld c,b
       or a
       jr z,+_
               add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
   _:
       pop de
       ld c,d
       ld d,e
       ld e,0
       add hl,de
       ret nc
       adc a,c
       ld c,a
       ret nc
       inc b
       ret

source: Zeda's Pastebin/BC_Times_DE

C_Times_D

   C_Time_D:
   ;Outputs:
   ;     A is the result
   ;     B is 0
        ld b,8          ;7           7
        xor a           ;4           4
          rlca          ;4*8        32
          rlc c         ;8*8        64
          jr nc,$+3     ;(12|11)    96|88
            add a,d     ;--
          djnz $-6      ;13*7+8     99
        ret             ;10         10
   ;304+b (b is number of bits)
   ;308 is average speed.
   ;12 bytes


mul32, 64-bit output

With these sizes, we need to use RAM to hold intermediate values. mul16 needs to perform DE*BC => DEHL

   mul32:
   ;;uses karatsuba multiplication
   ;;var_x * var_y
   ;;z0 holds the 64-bit result
   ;;708cc+6a+13b+42c +3mul
   ;;Avg: 2464.110153
   ;;Max:2839cc, 92cc faster
   ;;Min:2178cc (early can make it faster, though), 167cc faster
       ld de,(var_x)   ;\
       ld bc,(var_y)   ; |compute z0,z2
       push bc         ; | var_y
       call mul16      ; |
       ld (var_z0),hl  ; |
       ld bc,(var_y+2) ; |
       ld (var_z0+2),de; |
       ld de,(var_x+2) ; |
       push bc         ; | var_y+2
       call mul16      ; |
       ld (var_z2),hl  ; |
       ld (var_z2+2),de;/      208cc
       xor a           ;\
       ld hl,(var_x)   ; |
       ld de,(var_x+2) ; |
       add hl,de       ; |
       rra             ; |
       pop de          ; |
       ex (sp),hl      ; |
       add hl,de       ; |
       pop bc          ; |
       ex de,hl        ; |     109cc
       push de         ; |if bit0=1, add DE<<16 to result
       push bc         ; |
       push af         ; |c flag means add BC<<16 to result
       call mul16      ; |
       ex de,hl        ; |
       pop af          ; |
       pop bc          ; |
       jr nc,$+3       ; |     86+6a
       add hl,bc       ; |
       pop bc          ; |
       rla             ; |
       jr nc,$+4       ; |     26+13b
       add hl,bc       ; |
       adc a,0         ; |z1 = AHLDE-z2-z1
       ex de,hl \ ld bc,(var_z0) \ sbc hl,bc
       ex de,hl \ ld bc,(var_z0+2) \ sbc hl,bc
       sbc a,0
       ex de,hl \ ld bc,(var_z2) \ sbc hl,bc
       ex de,hl \ ld bc,(var_z2+2) \ sbc hl,bc
       sbc a,0         ; |z1 = AHLDE
       ld b,h \ ld c,l ;/ z1 = ABCDE
       ld hl,(var_z0+2);\
       add hl,de       ; |Add:
       ld (var_z0+2),hl; |z2z0
       ld hl,(var_z2)  ; | z1
       adc hl,bc       ; |----
       ld (var_z2),hl  ; |
       ret nc          ; |     279+42c
       ld hl,(var_z2+2); |
       inc hl          ; |
       ld (var_z2+2),hl; |
       ret             ;/

source Zeda's Pastebin/mul32

BCDE_Times_A

   BCDE_Times_A:
   ;Inputs: BC:DE,A
   ;Outputs: A:HL:IX is the 40-bit product, BC,DE unaffected
   ;503cc~831cc
   ;667cc average
   ;29 bytes
       ld ix,0
       ld hl,0
       call +_
   _:
       call +_
   _:
       call +_
   _:
       add ix,ix \ adc hl,hl \ rla \ ret nc
       add ix,de \ adc hl,bc \ adc a,0
       ret

source: Zeda's Pastebin/BCDE_Times_A

H_Times_E

This is the fastest and smallest rolled 8-bit multiplication routine here, and it returns the full 16-bit result.

   H_Times_E:
   ;Inputs:
   ;     H,E
   ;Outputs:
   ;     HL is the product
   ;     D,B are 0
   ;     A,E,C are preserved
   ;Size:  12 bytes
   ;Speed: 311+6b, b is the number of bits set in the input HL
   ;      average is 335 cycles
   ;      max required is 359 cycles
        ld d,0     ;1600    7      7
        ld l,d     ;6A      4      4
        ld b,8     ;0608    7      7
                   ;            
        add hl,hl  ;29      11*8   88
        jr nc,$+3  ;3001 12*8-5b   96-5b
        add hl,de  ;19      11*b   11b
        djnz $-4   ;10FA  13*8-5   99
                   ;            
        ret        ;C9      10     10

== H_Times_E == (Unrolled)

   H_Times_E:
   ;Inputs:
   ;     H,E
   ;Outputs:
   ;     HL is the product
   ;     D,B are 0
   ;     A,E,C are preserved
   ;Size:  38 bytes
   ;Speed: 198+6b+9p-7s, b is the number of bits set in the input H, p is if it is odd, s is the upper bit of h
   ;   average is 226.5 cycles (108.5 cycles saved)
   ;   max required is 255 cycles (104 cycles saved)
        ld d,0      ;1600   7   7
        ld l,d      ;6A     4   4
        ld b,8      ;0608   7   7
              ;      
        sla h   ;   8
        jr nc,$+3   ;3001  12-b
        ld l,e   ;6B    --
   
        add hl,hl   ;29    11
        jr nc,$+3   ;3001  12+6b
        add hl,de   ;19    --
   
        add hl,hl   ;29    11
        jr nc,$+3   ;3001  12+6b
        add hl,de   ;19    --
   
        add hl,hl   ;29    11
        jr nc,$+3   ;3001  12+6b
        add hl,de   ;19    --
   
        add hl,hl   ;29    11
        jr nc,$+3   ;3001  12+6b
        add hl,de   ;19    --
   
        add hl,hl   ;29    11
        jr nc,$+3   ;3001  12+6b
        add hl,de   ;19    --
   
        add hl,hl   ;29    11
        jr nc,$+3   ;3001  12+6b
        add hl,de   ;19    --
   
        add hl,hl   ;29   11
        ret nc      ;D0   11+15p
        add hl,de   ;19   --
        ret         ;C9   --

L_Squared (fast)

The following provides an optimized algorithm to square an 8-bit number, but it only returns the lower 8 bits.

   L_sqrd:
   ;Input: L
   ;Output: L*L->A
   ;151 t-states
   ;37 bytes
   	ld h,l
   ;First iteration, get the lowest 3 bits
   	sla l
   	rr h
   	sbc a,a
   	or l
   ;second iteration, get the next 2 bits
   	ld c,a
   	rr h
   	sbc a,a
   	xor l
   	and $F8
   	add a,c
   ;third iteration, get the next 2 bits
   	ld c,a
   	sla l
   	rr h
   	sbc a,a
   	xor l
   	and $E0
   	add a,c
   ;fourth iteration, get the last bit
   	ld c,a
   	ld a,l
   	add a,a
   	rrc h
   	xor h
   	and $80
   	xor c
   	neg
   	ret

Absolute Value

Here are a handful of optimised routines for the absolute value of a number:

absHL

   absHL:
        bit 7,h
        ret z
        xor a \ sub l \ ld l,a
        sbc a,a \ sub h \ ld h,a
        ret

absDE

   absDE:
        bit 7,d
        ret z
        xor a \ sub e \ ld e,a
        sbc a,a \ sub d \ ld d,a
        ret

absBC

   absBC:
        bit 7,b
        ret z
        xor a \ sub c \ ld c,a
        sbc a,a \ sub b \ ld b,a
        ret

absA

   absA:
        or a
        ret p
        neg
        ret

abs[reg8]

   abs[reg8]:
       xor a
       sub [reg8]
       ret m
       ld [reg8],a
       ret

Division

C_Div_D

This is a simple 8-bit division routine:


   C_Div_D:
   ;Inputs:
   ;     C is the numerator
   ;     D is the denominator
   ;Outputs:
   ;     A is the remainder
   ;     B is 0
   ;     C is the result of C/D
   ;     D,E,H,L are not changed
   ;
        ld b,8
        xor a
          sla c
          rla
          cp d
          jr c,$+4
            inc c
            sub d
          djnz $-8
        ret

BC_Div_DE

This divides BC by DE, storing the result in AC, remainder in HL

   BC_Div_DE:
   ;Inputs: BC,DE
   ;Outputs: DE unaffected, HL is remainder, AC is quotient, B=0
   ;20 bytes
   ;1098cc~1258cc, avg=1178cc
       ld hl,0
       ld a,b
       ld b,16
   _:
       sll c \ rla \ adc hl,hl \ sbc hl,de \ jr nc,$+4 \ add hl,de \ dec c
       djnz -_
       ret

DEHL_Div_C

This divides the 32-bit value in DEHL by C:

   DEHL_Div_C:
   ;Inputs:
   ;     DEHL is a 32 bit value where DE is the upper 16 bits
   ;     C is the value to divide DEHL by
   ;Outputs:
   ;    A is the remainder
   ;    B is 0
   ;    C is not changed
   ;    DEHL is the result of the division
   ;
        ld b,32
        xor a
          add hl,hl
          rl e \ rl d
          rla
          cp c
          jr c,$+4
            inc l
            sub c
          djnz $-11
        ret

DEHLIX_Div_C

   DEHLIX_Div_C:
   ;Inputs:
   ;     DEHLIX is a 48 bit value where DE is the upper 16 bits
   ;     C is the value to divide DEHL by
   ;Outputs:
   ;    A is the remainder
   ;    B is 0
   ;    C is not changed
   ;    DEHLIX is the result of the division
   ;
        ld b,48
        xor a
          add ix,ix
          adc hl,hl
          rl e \ rl d
          rla
          cp c
          jr c,$+5
            inc ixl
            sub c
          djnz $-15
        ret


HL_Div_C

   HL_Div_C:
   ;Inputs:
   ;     HL is the numerator
   ;     C is the denominator
   ;Outputs:
   ;     A is the remainder
   ;     B is 0
   ;     C is not changed
   ;     DE is not changed
   ;     HL is the quotient
   ;
          ld b,16
          xor a
            add hl,hl
            rla
            cp c
            jr c,$+4
              inc l
              sub c
            djnz $-7
          ret

HLDE_Div_C

   HLDE_Div_C:
   ;Inputs:
   ;     HLDE is a 32 bit value where HL is the upper 16 bits
   ;     C is the value to divide HLDE by
   ;Outputs:
   ;    A is the remainder
   ;    B is 0
   ;    C is not changed
   ;    HLDE is the result of the division
   ;
        ld b,32
        xor a
          sll e \ rl d
          adc hl,hl
          rla
          cp c
          jr c,$+4
            inc e
            sub c
          djnz $-12
        ret

RoundHL_Div_C

Returns the result of the division rounded to the nearest integer.

   RoundHL_Div_C:
   ;Inputs:
   ;     HL is the numerator
   ;     C is the denominator
   ;Outputs:
   ;     A is twice the remainder of the unrounded value 
   ;     B is 0
   ;     C is not changed
   ;     DE is not changed
   ;     HL is the rounded quotient
   ;     c flag set means no rounding was performed
   ;            reset means the value was rounded
   ;
          ld b,16
          xor a
            add hl,hl
            rla
            cp c
            jr c,$+4
              inc l
              sub c
            djnz $-7
          add a,a
          cp c
          jr c,$+3
            inc hl
          ret


Speed Optimised HL_div_10

By adding 9 bytes to the code, we save 87 cycles: (min speed = 636 t-states)

   DivHLby10:
   ;Inputs:
   ;     HL
   ;Outputs:
   ;     HL is the quotient
   ;     A is the remainder
   ;     DE is not changed
   ;     BC is 10
   
    ld bc,$0D0A
    xor a
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
   
    add hl,hl \ rla
    cp c
    jr c,$+4
      sub c
      inc l
    djnz $-7
    ret


Speed Optimised EHL_Div_10

By adding 20 bytes to the routine, we actually save 301 t-states. The speed is quite fast at a minimum of 966 t-states and a max of 1002:

   DivEHLby10:
   ;Inputs:
   ;     EHL
   ;Outputs:
   ;     EHL is the quotient
   ;     A is the remainder
   ;     D is not changed
   ;     BC is 10
   
    ld bc,$050a
    xor a
    sla e \ rla
    sla e \ rla
    sla e \ rla
   
    sla e \ rla
    cp c
    jr c,$+4
      sub c
      inc e
    djnz $-8
   
    ld b,16
   
    add hl,hl
    rla
    cp c
    jr c,$+4
    sub c
    inc l
    djnz $-7
    ret


Speed Optimised DEHL_Div_10

The minimum speed is now 1350 t-states. The cost was 15 bytes, the savings were 589 t-states

   DivDEHLby10:
   ;Inputs:
   ;     DEHL
   ;Outputs:
   ;     DEHL is the quotient
   ;     A is the remainder
   ;     BC is 10
   
    ld bc,$0D0A
    xor a
    ex de,hl
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
   
    add hl,hl \ rla
    cp c
    jr c,$+4
      sub c
      inc l
    djnz $-7
   
    ex de,hl
    ld b,16
   
    add hl,hl
    rla
    cp c
    jr c,$+4
    sub c
    inc l
    djnz $-7
    ret


A_Div_C (small)

This routine should only be used when C is expected to be greater than 16. In this case, the naive way is actually the fastest and smallest way:

    ld b,-1
    sub c
    inc b
    jr nc,$-2
    add a,c

Now B is the quotient, A is the remainder. It takes at least 26 t-states and at most 346 if you ensure that c>16

E_div_10 (tiny+fast)

This is how it would appear inline, since it is so small at 10 bytes (and 81 t-states). It divides E by 10, returning the result in H :

   e_div_10:
   ;returns result in H
        ld d,0
        ld h,d \ ld l,e
        add hl,hl
        add hl,de
        add hl,hl
        add hl,hl
        add hl,de
        add hl,hl



Square Root

RoundSqrtE

Returns the square root of E, rounded to the nearest integer:

   ;===============================================================
   sqrtE:
   ;===============================================================
   ;Input:
   ;     E is the value to find the square root of
   ;Outputs:
   ;     A is E-D^2
   ;     B is 0
   ;     D is the rounded result
   ;     E is not changed
   ;     HL is not changed
   ;Destroys:
   ;     C
   ;
           xor a               ;1      4         4
           ld d,a              ;1      4         4
           ld c,a              ;1      4         4
           ld b,4              ;2      7         7
   sqrtELoop:
           rlc d               ;2      8        32
           ld c,d              ;1      4        16
           scf                 ;1      4        16
           rl c                ;2      8        32
   
           rlc e               ;2      8        32
           rla                 ;1      4        16
           rlc e               ;2      8        32
           rla                 ;1      4        16
   
           cp c                ;1      4        16
           jr c,$+4            ;4    12|15      48+3x
             inc d             ;--    --        --
             sub c             ;--    --        --
           djnz sqrtELoop      ;2    13|8       47
           cp d                ;1      4         4
           jr c,$+3            ;3    12|11     12|11
             inc d             ;--    --        --
           ret                 ;1     10        10
   ;===============================================================
   ;Size  : 29 bytes
   ;Speed : 347+3x cycles plus 1 if rounded down
   ;   x is the number of set bits in the result.
   ;===============================================================


SqrtE

This returns the square root of E (rounded down).

   ;===============================================================
   sqrtE:
   ;===============================================================
   ;Input:
   ;     E is the value to find the square root of
   ;Outputs:
   ;     A is E-D^2
   ;     B is 0
   ;     D is the result
   ;     E is not changed
   ;     HL is not changed
   ;Destroys:
   ;     C=2D+1 if D is even, 2D-1 if D is odd
   
           xor a               ;1      4         4
           ld d,a              ;1      4         4
           ld c,a              ;1      4         4
           ld b,4              ;2      7         7
   sqrtELoop:
           rlc d               ;2      8        32
           ld c,d              ;1      4        16
           scf                 ;1      4        16
           rl c                ;2      8        32
   
           rlc e               ;2      8        32
           rla                 ;1      4        16
           rlc e               ;2      8        32
           rla                 ;1      4        16
   
           cp c                ;1      4        16
           jr c,$+4            ;4    12|15      48+3x
             inc d             ;--    --        --
             sub c             ;--    --        --
           djnz sqrtELoop      ;2    13|8       47
           ret                 ;1     10        10
   ;===============================================================
   ;Size  : 25 bytes
   ;Speed : 332+3x cycles
   ;   x is the number of set bits in the result. This will not
   ;   exceed 4, so the range for cycles is 332 to 344. To put this
   ;   into perspective, under the slowest conditions (4 set bits
   ;   in the result at 6MHz), this can execute over 18000 times
   ;   in a second.
   ;===============================================================


SqrtHL

This returns the square root of HL (rounded down). It is faster than division, interestingly:

   SqrtHL4:
   ;39 bytes
   ;Inputs:
   ;     HL
   ;Outputs:
   ;     BC is the remainder
   ;     D is not changed
   ;     E is the square root
   ;     H is 0
   ;Destroys:
   ;     A
   ;     L is a value of either {0,1,4,5}
   ;       every bit except 0 and 2 are always zero
   
        ld bc,0800h   ;3  10      ;10
        ld e,c        ;1  4       ;4
        xor a         ;1  4       ;4
   SHL4Loop:          ;           ;
        add hl,hl     ;1  11      ;88
        rl c          ;2  8       ;64
        adc hl,hl     ;2  15      ;120
        rl c          ;2  8       ;64
        jr nc,$+4     ;2  7|12    ;96+3y   ;y is the number of overflows. max is 2
        set 0,l       ;2  8       ;--
        ld a,e        ;1  4       ;32
        add a,a       ;1  4       ;32
        ld e,a        ;1  4       ;32
        add a,a       ;1  4       ;32
        bit 0,l       ;2  8       ;64
        jr nz,$+5     ;2  7|12    ;144-6y
        sub c         ;1  4       ;32
        jr nc,$+7     ;2  7|12    ;96+15x  ;number of bits in the result
            ld a,c    ;1  4       ;
            sub e     ;1  4       ;
            inc e     ;1  4       ;
            sub e     ;1  4       ;
            ld c,a    ;1  4       ;
        djnz SHL4Loop ;2  13|8    ;99
        bit 0,l       ;2  8       ;8
        ret z         ;1  11|19   ;11+8z
        inc b         ;1          ;
        ret           ;1          ;
   ;1036+15x-3y+8z
   ;x is the number of set bits in the result
   ;y is the number of overflows (max is 2)
   ;z is 1 if 'b' is returned as 1
   ;max is 1154 cycles
   ;min is 1032 cycles


SqrtL

This returns the square root of L, rounded down:


   SqrtL:
   ;Inputs:
   ;     L is the value to find the square root of
   ;Outputs:
   ;      C is the result
   ;      B,L are 0
   ;     DE is not changed
   ;      H is how far away it is from the next smallest perfect square
   ;      L is 0
   ;      z flag set if it was a perfect square
   ;Destroyed:
   ;      A
        ld bc,400h       ; 10    10
        ld h,c           ; 4      4
   sqrt8Loop:            ;
        add hl,hl        ;11     44
        add hl,hl        ;11     44
        rl c             ; 8     32
        ld a,c           ; 4     16
        rla              ; 4     16
        sub a,h          ; 4     16
        jr nc,$+5        ;12|19  48+7x
          inc c
          cpl
          ld h,a
        djnz sqrt8Loop   ;13|8   47
        ret              ;10     10
   ;287+7x
   ;19 bytes



ConvFloat (or ConvOP1)

This converts a floating point number pointed to by DE to a 16 bit-value. This is like bcall(_ConvOP1) without the limit of 9999 and a bit more flexible (since the number doesn't need to be at OP1):


   ConvOP1: 
   ;;Output: HL is the 16-bit result. 
       ld de,OP1 
   ConvFloat: 
   ;;Input: DE points to the float. 
   ;;Output: HL is the 16-bit result. 
   ;;Errors: DataType if the float is negative or complex 
   ;;        Domain if the integer exceeds 16 bits. 
   ;;Timings:  Assume no errors were called. 
   ;;  Input is on: 
   ;;  (0,1)         => 59cc                        Average=59 
   ;;  0 or [1,10)   => 120cc or 129cc                     =124.5 
   ;;  [10,100)      => 176cc or 177cc                     =176.5 
   ;;  [100,1000)    => 309cc, 310cc, 318cc, or 319cc.     =314 
   ;;  [1000,10000)  => 376cc to 378cc                     =377 
   ;;  [10000,65536) => 514cc to 516cc, or 523cc to 525cc  =519.5 
   ;;Average case:  496.577178955078125cc 
   ;;vs 959.656982421875cc 
   ;;87 bytes 
       ld a,(de) 
       or a 
       jr nz,ErrDataType 
       inc de 
       ld hl,0 
       ld a,(de) 
       inc de 
       sub 80h 
       ret c 
       jr z,final 
       cp 5 
       jp c,enterloop 
   ErrDomain: 
   ;Throws a domain error. 
       bcall(_ErrDomain) 
   ErrDataType: 
   ;Throws a data type error. 
       bcall(_ErrDataType) 
   loop: 
       ld a,b 
       ld b,h 
       ld c,l 
       add hl,hl 
       add hl,bc 
       add hl,hl 
       add hl,hl 
       add hl,hl 
       add hl,bc 
       add hl,hl 
       add hl,hl 
   enterloop: 
       ld b,a 
       ex de,hl 
       ld a,(hl) \ and $F0 \ rra \ ld c,a \ rra \ rra \ sub c \ add a,(hl) 
       inc hl 
       ex de,hl 
       add a,l 
       ld l,a 
       jr nc,$+3 
       inc h 
       dec b 
       ret z 
       djnz loop 
       ld b,h 
       ld c,l 
       xor a 
   ;check overflow in this mul by 10! 
       add hl,hl \ adc a,a 
       add hl,hl \ adc a,a 
       add hl,bc \ adc a,0 
       add hl,hl \ adc a,a 
       jr nz,ErrDomain 
   final: 
       ld a,(de) 
       rrca 
       rrca 
       rrca 
       rrca 
       and 15 
       add a,l 
       ld l,a 
       ret nc 
       inc h 
       ret nz
       jr ErrDomain

source: Cemetech/Useful Routines

ConvStr16

This will convert a string of base-10 digits to a 16-bit value. Useful for parsing numbers in a string:

   ;===============================================================
   ConvRStr16:
   ;===============================================================
   ;Input: 
   ;     DE points to the base 10 number string in RAM. 
   ;Outputs: 
   ;     HL is the 16-bit value of the number 
   ;     DE points to the byte after the number 
   ;     BC is HL/10 
   ;     z flag reset (nz)
   ;     c flag reset (nc)
   ;Destroys: 
   ;     A (actually, add 30h and you get the ending token) 
   ;Size:  23 bytes 
   ;Speed: 104n+42+11c
   ;       n is the number of digits 
   ;       c is at most n-2 
   ;       at most 595 cycles for any 16-bit decimal value 
   ;===============================================================
        ld hl,0          ;  10 : 210000 
   ConvLoop:             ; 
        ld a,(de)        ;   7 : 1A 
        sub 30h          ;   7 : D630 
        cp 10            ;   7 : FE0A 
        ret nc           ;5|11 : D0 
        inc de           ;   6 : 13 
                         ; 
        ld b,h           ;   4 : 44 
        ld c,l           ;   4 : 4D 
        add hl,hl        ;  11 : 29 
        add hl,hl        ;  11 : 29 
        add hl,bc        ;  11 : 09 
        add hl,hl        ;  11 : 29 
                         ; 
        add a,l          ;   4 : 85 
        ld l,a           ;   4 : 6F 
        jr nc,ConvLoop   ;12|23: 30EE 
        inc h            ; --- : 24 
        jr ConvLoop      ; --- : 18EB


gcdHL_DE

This computes the Greatest Common Divisor of HL and DE, using the binary gcd algorithm.

   gcdHL_DE:
   ;gcd(HL,DE)->HL
   ;binary GCD algorithm
       ld a,h \ or l \ ret z
       ex de,hl
       ld a,h \ or l \ ret z
       sbc hl,de
       add hl,de
       ret z
       ld b,1
       ld a,e \ or l \ rra \ jr c,+_
       inc b
       rr h \ rr l
       rr d \ rr e
       ld a,e \ or l \ rra \ jr nc,$-12
   _:
       srl h \ rr l \ jr nc,$-4 \ adc hl,hl
       ex de,hl
   _:
       srl h \ rr l \ jr nc,$-4 \ adc hl,hl
       xor a \ sbc hl,de
       jr z,+_
       jr nc,-_ \ sub l \ ld l,a \ sbc a,a \ sub h \ ld h,a
       jp -_-1
   _:
       ex de,hl
       dec b
       ret z
       add hl,hl
       djnz $-1
       ret

Modulus

DEHL_mod_3

See below.

HLDE_mod_3

See below.

HL_mod_3

See below.

A_mod_3

Computes A mod 3 (essentially, the remainder of A after division by 3):

   DEHL_mod_3:
   HLDE_mod_3:
   ;Inputs: 
   ;  HL:DE  unsigned integer 
   ;Outputs: 
   ;  A = HLDE mod 3 
   ;  Z flag is set if divisible by 3 
   ;Destroys: 
   ;  C 
   ; 27 bytes, 103cc,118cc,104cc,119cc, avg=114.75cc
        add hl,de
        jr nc,$+3
        dec hl
   HL_mod_3:
   ;Inputs: 
   ;  HL  unsigned integer 
   ;Outputs: 
   ;  A = HL mod 3 
   ;  Z flag is set if divisible by 3 
   ;Destroys: 
   ;  C 
   ; 23 bytes, 80cc or 95cc, avg 91.25cc
       ld a,h
       add a,l
       sbc a,0   ;conditional decrement
   A_mod_3: 
   ;Inputs: 
   ;  A  unsigned integer 
   ;Outputs: 
   ;  A = A mod 3 
   ;  Z flag is set if divisible by 3 
   ;Destroys: 
   ;  C 
   ; 19 bytes,  65cc or 80cc, avg=76.25cc
       ld c,a                     ;add nibbles  
       rrca / rrca / rrca / rrca  
       add a,c  
       adc a,0                    ;n mod 15 (+1) in both nibbles  
       ld c,a                     ;add half nibbles  
       rrca / rrca  
       add a,c  
       adc a,1  
       ret z  
       and 3 
       dec a 
       ret

source: Cemetech/Fast 8-bit mod 3


A_mod_10:

This is not a typical method used, but it is small and fast at 196 to 201 t-states, 12 bytes

   ld bc,05A0h
   Loop:
        sub c
        jr nc,$+3
          add a,c
        srl c
        djnz Loop
        ret

Random Numbers

rand8_LCG

This is one of many variations of PRNGs. This routine is not particularly useful for many games, but is fairly useful for shuffling a deck of cards. It uses SMC, but that can be fixed by defining randSeed elsewhere and using ld a,(randSeed) at the beginning.

   rand8_LCG:
   ;f(n+1)=13f(n)+83
   ;97 cycles
   randSeed=$+1
       ld a,3
       ld c,a
       add a,a
       add a,c
       add a,a
       add a,a
       add a,c
       add a,83
       ld (randSeed),a
       ret

rand16_LCG

Similar to the rand8_LCG, this generates a a sequence of pseudo-random values that has a cycle of 65536 (so it will hit every single 16-bit integer):

   rand16_LCG:
   ;f(n+1)=241f(n)+257   ;65536
   ;181 cycles, add 17 if called
   ;Outputs:
   ;     BC was the previous pseudorandom value
   ;     HL is the next pseudorandom value
   ;Notes:
   ;     You can also use B,C,H,L as pseudorandom 8-bit values
   ;     this will generate all 8-bit values
   randSeed=$+1
       ld hl,235
       ld c,l
       ld b,h
       add hl,hl
       add hl,bc
       add hl,hl
       add hl,bc
       add hl,hl
       add hl,bc
       add hl,hl
       add hl,hl
       add hl,hl
       add hl,hl
       add hl,bc
       inc h
       inc hl
       ld (randSeed),hl
       ret

rand; best quality:speed

This routine uses a combined LFSR and LCG to offer an extremely fast and proven high quality pseudo random numbers.

   rand: 
   ;;Tested and passes all CAcert tests 
   ;;Uses a very simple 32-bit LCG and 32-bit LFSR 
   ;;it has a period of 18,446,744,069,414,584,320 
   ;;roughly 18.4 quintillion. 
   ;;291cc 
   ;;58 bytes 
   seed1_0=$+1 
       ld hl,12345 
   seed1_1=$+1 
       ld de,6789 
       ld b,h 
       ld c,l 
       add hl,hl \ rl e \ rl d 
       add hl,hl \ rl e \ rl d 
       inc l 
       add hl,bc 
       ld (seed1_0),hl 
       ld hl,(seed1_1) 
       adc hl,de 
       ld (seed1_1),hl 
       ex de,hl 
   seed2_0=$+1 
       ld hl,9876 
   seed2_1=$+1 
       ld bc,54321 
       add hl,hl \ rl c \ rl b 
       ld (seed2_1),bc 
       sbc a,a 
       and %11000101 
       xor l 
       ld l,a 
       ld (seed2_0),hl 
       ex de,hl 
       add hl,bc 
       ret

source: Cemetech/Useful Routines

rand; very fast

The cycle for this is more limited, but is still quite large and well suited to games.

   rand:
   ;collab with Runer112
   ;;Output:
   ;;    HL is a pseudo-random int
   ;;    A and BC are also, but much weaker and smaller cycles
   ;;    Preserves DE
   ;;148cc, super fast
   ;;26 bytes
   ;;period length: 4,294,901,760
   seed1=$+1
       ld hl,9999
       ld b,h
       ld c,l
       add hl,hl
       add hl,hl
       inc l
       add hl,bc
       ld (seed1),hl
   seed2=$+1
       ld hl,987
       add hl,hl
       sbc a,a
       and %00101101
       xor l
       ld l,a
       ld (seed2),hl
       add hl,bc
       ret

source: Zeda's Pastebin/rand

randInt

This returns a random integer on [0,A-1].

   rand:
   ;;Input: A is the range.
   ;;Output: Returns in A a random number from 0 to B-1.
   ;;  B=0
   ;;  DE is not changed
   ;;Destroys:
   ;;  HL
   ;;Speed:
   ;;  322cc to 373cc, 347.5cc average
       push af
       call rand
       ex de,hl
       pop af
       ld hl,0
       ld b,h
       add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
       add hl,hl \ rla \ ret nc \ add hl,de \ adc a,b
       ret

source: Zeda's Pastebin/rand

Fixed Point Math

Fixed Point numbers are similar to Floating Point numbers in that they give the user a way to work with non-integers. For some terminology, an 8.8 Fixed Point number is 16 bits where the upper 8 bits is the integer part, the lower 8 bits is the fractional part. Both Floating Point and Fixed Point are abbreviated 'FP', but one can tell if Fixed Point is being referred to by context. The way one would interpret an 8.8 FP number would be to take the upper 8 bits as the integer part and divide the lower 8-bits by 256 (2[sup]8[/sup]) so if HL is an 8.8 FP number that is $1337, then its value is 19+55/256 = 19.21484375. In most cases, integers are enough for working in Z80 Assembly, but if that doesn't work, you will rarely need more than 16.16 FP precision (which is 32 bits in all). FP algorithms are generally pretty similar to their integer counterparts, so it isn't too difficult to convert.

FPLog88

This is an 8.8 fixed point natural log routine. This is extremely accurate. In the very worst case, it is off by 2/256, but on average, it is off by less than 1/256 (the smallest unit for an 8.8 FP number).

   FPLog88:
   ;Input:
   ;     HL is the 8.8 Fixed Point input. H is the integer part, L is the fractional part.
   ;Output:
   ;     HL is the natural log of the input, in 8.8 Fixed Point format.
        ld a,h
        or l
        dec hl
        ret z
        inc hl
        push hl
        ld b,15
        add hl,hl
        jr c,$+4
        djnz $-3
        ld a,b
        sub 8
        jr nc,$+4
        neg
        ld b,a
        pop hl
        push af
        jr nz,lnx
        jr nc,$+7
        add hl,hl
        djnz $-1
        jr lnx
        sra h
        rr l
        djnz $-4
   lnx:
        dec h        ;subtract 1 so that we are doing ln((x-1)+1) = ln(x)
        push hl      ;save for later
        add hl,hl    ;we are doing the 4x/(4+4x) part
        add hl,hl
        ld d,h
        ld e,l
        inc h
        inc h
        inc h
        inc h
        call FPDE_Div_HL  ;preserves DE, returns AHL as the 16.8 result
        pop de       ;DE is now x instead of 4x
        inc h        ;now we are doing x/(3+Ans)
        inc h
        inc h
        call FPDE_Div_HL
        inc h        ;now we are doing x/(2+Ans)
        inc h
        call FPDE_Div_HL
        inc h        ;now we are doing x/(1+Ans)
        call FPDE_Div_HL  ;now it is computed to pretty decent accuracy
        pop af       ;the power of 2 that we divided the initial input by
        ret z        ;if it was 0, we don't need to add/subtract anything else
        ld b,a
        jr c,SubtLn2
        push hl
        xor a
        ld de,$B172  ;this is approximately ln(2) in 0.16 FP format
        ld h,a
        ld l,a
        add hl,de
        jr nc,$+3
        inc a
        djnz $-4
        pop de
        rl l         ;returns c flag if we need to round up
        ld l,h
        ld h,a
        jr nc,$+3
        inc hl
        add hl,de
        ret
   SubtLn2:
        ld de,$00B1
          or a
          sbc hl,de
          djnz $-3
        ret
   
   
   FPDE_Div_HL:
   ;Inputs:
   ;     DE,HL are 8.8 Fixed Point numbers
   ;Outputs:
   ;     DE is preserved
   ;     AHL is the 16.8 Fixed Point result (rounded to the least significant bit)
        di
        push de
        ld b,h
        ld c,l
        ld a,16
        ld hl,0
   Loop1:
        sla e
        rl d
        adc hl,hl
        jr nc,$+8
        or a
        sbc hl,bc
        jp incE
        sbc hl,bc
        jr c,$+5
   incE:
        inc e
        jr $+3
        add hl,bc
        dec a
        jr nz,Loop1
        ex af,af'
        ld a,8
   Loop2:
        ex af,af'
        sla e
        rl d
        rl a
        ex af,af'
        add hl,hl
        jr nc,$+8
        or a
        sbc hl,bc
        jp incE_2
        sbc hl,bc
        jr c,$+5
   incE_2:
        inc e
        jr $+3
        add hl,bc
        dec a
        jr nz,Loop2
   ;round
        ex af,af'
        add hl,hl
        jr c,$+6
        sbc hl,de
        jr c,$+9
        inc e
        jr nz,$+6
        inc d
        jr nz,$+3
        inc a
        ex de,hl
        pop de
        ret

FPDE_Div_BC88

This performs Fixed Point division for DE/BC where DE and BC are 8.8 FP numbers. This returns a little extra precision for the integer part (16-bit integer part, 8-bit fractional part).

   FPDE_Div_BC88:
   ;Inputs:
   ;     DE,BC are 8.8 Fixed Point numbers
   ;Outputs:
   ;     ADE is the 16.8 Fixed Point result (rounded to the least significant bit)
        di
        ld a,16
        ld hl,0
   Loop1:
        sla e
        rl d
        adc hl,hl
        jr nc,$+8
        or a
        sbc hl,bc
        jp incE
        sbc hl,bc
        jr c,$+5
   incE:
        inc e
        jr $+3
        add hl,bc
        dec a
        jr nz,Loop1
        ex af,af'
        ld a,8
   Loop2:
        ex af,af'
        sla e
        rl d
        rla
        ex af,af'
        add hl,hl
        jr nc,$+8
        or a
        sbc hl,bc
        jp incE_2
        sbc hl,bc
        jr c,$+5
   incE_2:
        inc e
        jr $+3
        add hl,bc
        dec a
        jr nz,Loop2
   ;round
        ex af,af'
        add hl,hl
        jr c,$+5
        sbc hl,de
        ret c
        inc e
        ret nz
        inc d
        ret nz
        inc a
        ret

Log_2_88

These computes log base 2 of the fixed point 8.8 number. This is much faster and smaller than the natural log routine above.

(size optimised)

   Log_2_88_size:
   ;Inputs:
   ;     HL is an unsigned 8.8 fixed point number.
   ;Outputs:
   ;     HL is the signed 8.8 fixed point value of log base 2 of the input.
   ;Example:
   ;     pass HL = 3.0, returns 1.58203125 (actual is ~1.584962501...)
   ;averages about 39 t-states slower than original
   ;62 bytes
        ex de,hl
        ld hl,0
        ld a,d
        ld c,8
        or a
        jr z,DE_lessthan_1
        srl d
        jr z,logloop-1
        inc l
        rr e
        jr $-7
   DE_lessthan_1:
        ld a,e
        dec hl
        or a
        ret z
        inc l
        dec l
        add a,a
        jr nc,$-2
        ld e,a
   
        inc d
   logloop:
        add hl,hl
        push hl
        ld h,d
        ld l,e
        ld a,e
        ld b,8
   
        add hl,hl
        rla
        jr nc,$+5
          add hl,de
          adc a,0
        djnz $-7
   
        ld e,h
        ld d,a
        pop hl
        rr a           ;this is right >_>
        jr z,$+7
          srl d
          rr e
          inc l
        dec c
        jr nz,logloop
        ret

(speed optimised)

   Log_2_88_speed:
   ;Inputs:
   ;     HL is an unsigned 8.8 fixed point number.
   ;Outputs:
   ;     HL is the signed 8.8 fixed point value of log base 2 of the input.
   ;Example:
   ;     pass HL = 3.0, returns 1.58203125 (actual is ~1.584962501...)
   ;saves at least 688 t-states over regular (about 17% speed boost)
   ;98 bytes
        ex de,hl
        ld hl,0
        ld a,d
        ld c,8
        or a
        jr z,DE_lessthan_1
        srl d
        jr z,logloop-1
        inc l
        rr e
        jp $-7
   DE_lessthan_1:
        ld a,e
        dec hl
        or a
        ret z
        inc l
        dec l
        add a,a
        jr nc,$-2
        ld e,a
   
        inc d
   logloop:
        add hl,hl
        push hl
        ld h,d
        ld l,e
        ld a,e
        ld b,7
   
        add hl,hl
        rla
        jr nc,$+3
          add hl,de
   
        add hl,hl
        rla
        jr nc,$+3
          add hl,de
   
        add hl,hl
        rla
        jr nc,$+3
          add hl,de
   
        add hl,hl
        rla
        jr nc,$+3
          add hl,de
   
        add hl,hl
        rla
        jr nc,$+3
          add hl,de
   
        add hl,hl
        rla
        jr nc,$+3
          add hl,de
   
        add hl,hl
        rla
        jr nc,$+5
          add hl,de
          adc a,0
   
        add hl,hl
        rla
        jr nc,$+5
          add hl,de
          adc a,0
   
        ld e,h
        ld d,a
        pop hl
        rr a
        jr z,$+7
          srl d
          rr e
          inc l
        dec c
        jr nz,logloop
        ret

(balanced)

(this only saves about 40 cycles over the size optimised one)

   Log_2_88:
   ;Inputs:
   ;     HL is an unsigned 8.8 fixed point number.
   ;Outputs:
   ;     HL is the signed 8.8 fixed point value of log base 2 of the input.
   ;Example:
   ;     pass HL = 3.0, returns 1.58203125 (actual is ~1.584962501...)
   ;70 bytes
        ex de,hl
        ld hl,0
        ld a,d
        ld c,8
        or a
        jr z,DE_lessthan_1
        srl d
        jr z,logloop-1
        inc l
        rr e
        jp $-7
   DE_lessthan_1:
        ld a,e
        dec hl
        or a
        ret z
        inc l
        dec l
        add a,a
        jr nc,$-2
        ld e,a
   
        inc d
   logloop:
        add hl,hl
        push hl
        ld h,d
        ld l,e
        ld a,e
        ld b,7
   
        add hl,hl
        rla
        jr nc,$+3
          add hl,de
        djnz $-5
   
        adc a,0
        add hl,hl
        rla
        jr nc,$+5
          add hl,de
          adc a,0
        ld e,h
        ld d,a
        pop hl
        rr a
        jr z,$+7
          srl d
          rr e
          inc l
        dec c
        jr nz,logloop
        ret