z80:Unrolled Loops

From Learn @ Cemetech
Jump to navigationJump to search

Unrolled loops provide an easy to use speed optimisation for many routines, especially math routines. The premise is that if you have to loop through some code a fixed number of times, instead of using a loop counter (often we use B to take advantage of DJNZ), we can copy the looping code the fixed number of times.

Let's begin this by looking at a generic multiplication routine:

   DE_Times_A:
   ;Inputs:
   ;     DE and A are factors
   ;Outputs:
   ;     A is not changed
   ;     B is 0
   ;     C is not changed
   ;     DE is not changed
   ;     HL is the product
   ;Time:
   ;     342+6x
   ;
        ld b,8          ;7           7
        ld hl,0         ;10         10
          add hl,hl     ;11*8       88
          rlca          ;4*8        32
          jr nc,$+3     ;(12|18)*8  96+6x
            add hl,de   ;--         --
          djnz $-5      ;13*7+8     99
        ret             ;10         10

The numbers on the right reflect how many t-states (also known as clock cycles) that the instruction uses. At 6MHz, the calculator executes 6000000 clock cycles. From this, we see that DJNZ uses 13 clock cycles unless B is decremented to zero, at which point it does not jump back and instead uses 8 cycles. Unrolling this loop we no longer need B as a counter and it would look like:

   DE_Times_A:
   ;Inputs:
   ;     DE and A are factors
   ;Outputs:
   ;     A is not changed
   ;     BC is not changed
   ;     DE is not changed
   ;     HL is the product
   ;Time:
   ;     342+6x
   ;
        ld hl,0         ;10         10
   ;1
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
   ;2
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
   ;3
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
   ;4
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
   ;5
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
   ;6
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
   ;7
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
   ;8
          add hl,hl     ;11
          rlca          ;4
          jr nc,$+3     ;(12|18)
            add hl,de   ;--         --
        ret             ;10         10

This saves us 7+13*8-5 t-states (106) which is about a 31% gain in speed. But there are a few small optimisations that can be made to this for speed and size. First note that the first 'add hl,hl' is redundant since HL is initially 0. Likewise, if the first 'rlca' returns nc, then the second 'add hl,hl' is also redundant, so we can jump over that, conditionally, too. At the end, 'jr nc,$+3' simply jumps to an RET, so we can replace that with 'RET NC'. Putting this together:

   DE_Times_A:
   ;Inputs:
   ;     DE and A are factors
   ;Outputs:
   ;     A is not changed
   ;     BC is not changed
   ;     DE is not changed
   ;     HL is the product
   ;Time:
   ;     342+6x
   ;
        ld hl,0         ;10         10
   ;1
        rlca          ;4
        jr nc,$+4     ;(12|29)
          add hl,de   ;--
          add hl,hl     ;--
   ;2
        rlca          ;4
        jr nc,$+3     ;(12|18)
          add hl,de   ;--
   ;3
        add hl,hl     ;11
        rlca          ;4
        jr nc,$+3     ;(12|18)
          add hl,de   ;--
   ;4
        add hl,hl     ;11
        rlca          ;4
        jr nc,$+3     ;(12|18)
          add hl,de   ;--
   ;5
        add hl,hl     ;11
        rlca          ;4
        jr nc,$+3     ;(12|18)
          add hl,de   ;--
   ;6
        add hl,hl     ;11
        rlca          ;4
        jr nc,$+3     ;(12|18)
          add hl,de   ;--
   ;7
        add hl,hl     ;11
        rlca          ;4
        jr nc,$+3     ;(12|18)
          add hl,de   ;--
   ;8
        add hl,hl     ;11
        rlca          ;4
        ret nc       ;26|11
        add hl,de   ;--
        ret             ;--         10

So now it is 2 bytes smaller and a few cycles faster (at least 13). At the fastest, it is 203 cycles, which is about a 41% gain in speed from the original fastest, and the slowest is 271 cycles, a 31% gain in speed from the original. However, comparing the sizes, the optimised unrolled code is 43 bytes compared to 13 bytes.

However, there are scenarios that paint the optimisation of unrolling code in a much better light. For example, Say we want to specifically make a routine to multiply HL by 10. This is pretty common when converting a string of decimal digits. Here is a solution:

   HL_Times_10:
        ex de,hl
        ld a,10
   DE_Times_A:
   ;Inputs:
   ;     DE and A are factors
   ;Outputs:
   ;     A is not changed
   ;     B is 0
   ;     C is not changed
   ;     DE is not changed
   ;     HL is the product
   ;Time:
   ;     342+6x
   ;
        ld b,8          ;7           7
        ld hl,0         ;10         10
          add hl,hl     ;11*8       88
          rlca          ;4*8        32
          jr nc,$+3     ;(12|18)*8  96+6x
            add hl,de   ;--         --
          djnz $-5      ;13*7+8     99
        ret             ;10         10

But wait a minute-- A=10 in binary is A=00001010,,2,,, so we know that the first 4 shifts of A will result in NC. This is a huge waste of cycles! Before unrolling, let's optimise our code a little. Start with A shifted 4 bits and we only need to read the remaining 4 bits:

   HL_Times_10:
        ex de,hl
        ld a,%10100000
        ld b,4          ;7           7
        ld hl,0         ;10         10
          add hl,hl     ;11*4       44
          rlca          ;4*4        16
          jr nc,$+3     ;(12|18)*4  48+12=60
            add hl,de   ;--         --
          djnz $-5      ;13*4-5     47
        ret             ;10         10

So now the routine is down to 205 cycles compared to 365-- a 44% gain in speed. But now we can be really snazzy with our optimisation. Since now we are only looking at the bits 1010, we know that every other cycle we are going to skip 'add hl,de', so we will only be doing 'add hl,hl'. What sorcery do we use, then? Let's partially unroll some of the code and only loop twice:

   HL_Times_10:
        ex de,hl        ;4
        ld a,160        ;7
        ld b,2          ;7           7
        ld hl,0         ;10         10
          add hl,hl     ;11*2       22
          rlca          ;4*2        8
          jr nc,$+3     ;(12|18)*2  36
            add hl,de   ;--         --
   
          add hl,hl     ;11*2       22
          rlca          ;4*2        8
   
          djnz $-5      ;13*2-5     21
        ret             ;10         10

But really, now we know that the first 'rlca' will always return c, so we no longer need to jump conditionally-- we know the condition:

   HL_Times_10:
        ex de,hl        ;4
        ld a,160        ;7
        ld b,2          ;7           7
        ld hl,0         ;10         10
          add hl,hl     ;11*2       22
          rlca          ;4*2        8
          add hl,de     ;11*2        22
   
          add hl,hl     ;11*2       22
          rlca          ;4*2        8
   
          djnz $-5      ;13*2-5     21
        ret             ;10         10

Actually, since we know the condition, we don't even need 'a' anymore:

   HL_Times_10:
        ex de,hl        ;4
        ld b,2          ;7           7
        ld hl,0         ;10         10
          add hl,hl     ;11*2       22
          add hl,de     ;11*2       22
          add hl,hl     ;11*2       22
          djnz $-5      ;13*2-5     21
        ret             ;10         10

Now it is 118 cycles compared to 365-- about 68% speed gain. Now let's see if you know where we are going. Unroll:

   HL_Times_10:
        ex de,hl        ;4
        ld hl,0         ;10
          add hl,hl     ;11
          add hl,de     ;11
          add hl,hl     ;11
          add hl,hl     ;11
          add hl,de     ;11
          add hl,hl     ;11
        ret             ;10

Interestingly, this is actually 1 byte smaller. But that isn't all. Observe that the first 'add hl,hl' is redundant. HL is initialised to 0, so it just results in 0.

   HL_Times_10:
        ex de,hl        ;4
        ld hl,0         ;10
          add hl,de     ;11
          add hl,hl     ;11
          add hl,hl     ;11
          add hl,de     ;11
          add hl,hl     ;11
        ret             ;10

Two bytes smaller. This means that the first time it does add hl,de, we are basically setting HL=DE. Why do we even need to set HL=0? If we set HL=DE, we don't even need the 'ex de,hl' or the first 'add hl,de':

   HL_Times_10:
        ld d,l          ;4
        ld e,h          ;4
          add hl,hl     ;11
          add hl,hl     ;11
          add hl,de     ;11
          add hl,hl     ;11
        ret             ;10

And now we have gone to 7 bytes of code at 62 t-states, compared to 16 bytes and 365 t-states. We cut the code size in less than half and the code runs almost 6 times faster. On top of that, we can use BC instead of DE if we want to. The code is small enough that you might even consider using it inline instead of as a subroutine, saving the 10 cycles from the RET and 17 cycles from using a CALL instruction.

Constant multiplication is often much faster and often smaller when unrolled. Another common use is HL_Mul_12, when dealing with graphics. Since there are 12 bytes to each row on the graph buffer, if HL has the Y coordinate, we can find the offset into the buffer:

   HL_Mul_12:
        ld b,h
        ld c,l
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,hl

At 52 cycles, that is often used inline, since making it a subroutine would add another 17+10 cycles which isn't really worth the size optimisation when dealing with graphics. You would need to call it as a subroutine in at least three places to even get any kind of size optimisation, anyways.

A_Times_10:

   ld c,a
        add a,a
        add a,a
        add a,c
        add a,a

In fact, if you are ever multiplying by an even integer, you are sure to have the last instruction as 'add a,a' or 'add hl,hl'. So really, we typically only fret about multiplying by odd values. One interesting remark to make is that if you unroll the loop for multiplying by 15, you come to the code:

   ld b,h
        ld c,l
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc

But it is actually a little faster to first multiply HL by 3, then multiply that result by 5. Reducing an odd multiple to its factors is not always an otpimisation, but it happens to be in a few cases:

   ;HL_Times_3:
        ld b,h
        ld c,l
        add hl,hl
        add hl,bc
   ;HL_Times_5:
        ld b,h
        ld c,l
        add hl,hl
        add hl,hl
        add hl,bc

It is 3 cycles faster, but 1 byte larger, due to the fact that 'ld b,h \ ld c,l' is 8 cycles, compared to 11, for 'add hl,[reg16]'. You might note that 15=%00001111, and then notice that %00011111 is prime, so it cannot be factored like this, but %00111111 (and any other similar value with an even number of 1s) can be factored to %00001001 * %00000111 so to multiply by 63: (Method 1: Unrolling)

   ld b,h
        ld c,l
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc
   
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc

(Method 2: Factor)

   HL_Mul_7:
        ld b,h
        ld c,l
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc
   
   ;HL_Mul_9:
        ld b,h
        ld c,l
        add hl,hl
        add hl,hl
        add hl,hl
        add hl,bc

The next such value is %11111111 = 255. Unrolled:

   ld b,h
        ld c,l
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc
   
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc
   
        add hl,hl
        add hl,bc
        add hl,hl
        add hl,bc

(Method 2: Factor)

   HL_Mul_15:
   ;HL_Mul_3:
        ld b,h
        ld c,l
        add hl,hl
        add hl,bc
   ;HL_Mul_5:
        ld b,h
        ld c,l
        add hl,hl
        add hl,hl
        add hl,bc
   
   ;HL_Mul_17:
        add hl,hl
        add hl,hl
        add hl,hl
        add hl,hl
        add hl,bc


But once again, we cannot always let unrolling code seem too appealing. (Method 3: Smarter way)

   ld b,h
        ld c,l
   ;HL_Mul_256:
        ld h,l
        ld l,0
   ;HL_Minus_BC:
        or a         ;reset the c flag
        sbc hl,bc

We just multiply by 256 and subtract the original. 38 cycles, 8 bytes, versus 134 cycles, 14 bytes. For more constant multiplication, see the optimised constant multiplication page.