KarolS/millfork

byte multiplication problem

zbyti opened this issue · 5 comments

zbyti commented
const word probe = 9999
const word radius = 127 * 127
pointer screen @ $80

asm void pause() {
  lda os_RTCLOK.b2
  .rt_check:
  cmp os_RTCLOK.b2
  beq .rt_check
  rts
}

// print in HEX
void printScore(word val) {
  array(byte) tmp[4]
  byte iter

  tmp[0] = val.hi >> 4
  tmp[1] = val.hi & %00001111
  tmp[2] = val.lo >> 4
  tmp[3] = val.lo & %00001111

  for iter:tmp {
    if tmp[iter] < 10 {
      screen[iter] = tmp[iter] + $10
    } else {
      screen[iter] = tmp[iter] + $17
    }
  }
  screen += 40
}

void main() {
  array(bool) flags[size] align(1024)
  word i@$e0, bingo@$e2
  word x@$e4, y@$e6, p@$e8, t@$ea
  byte n@$82

  screen = os_SAVMSC

  x = 0
  y = 0
  bingo = 0

  pause()
  os_RTCLOK = 0

  for i,0,to,probe {
    n = pokey_random & 127
    x = n * n
    n = pokey_random & 127
    y = n * n
    if ((x + y) <= radius) {
      bingo += 1
    }
  }
  p = 4 * bingo

  t = os_RTCLOK.b2 + (os_RTCLOK.b1 * 256)

  printScore(t)
  printScore(p)

  while true {}
}

atari000

If n is word everything is OK but byte produce wrong score, n * n is not promoted to word .

zbyti commented

I am aware of the difference in single results, but what interests me is the multiplication procedure implemented in a given language, then you can see a significant difference.

zbyti commented

In this benchmark Mad Pascal used:

/*

;
; Ullrich von Bassewitz, 2009-08-17
;
; CC65 runtime: 8x8 => 16 unsigned multiplication
;

*/
.proc	imulCL
ptr1 = ecx
ptr4 = eax
	
	ldy #8
	lda #0

        lsr     ptr4            ; Get first bit into carry
@L0:    bcc     @L1
        clc
        adc     ptr1
@L1:    ror	@
        ror     ptr4
        dey
        bne     @L0
        sta	ptr4+1

	rts
.endp
program MonteCarloPi;

uses crt;

{$define FAST}

var
	rtClock1              : byte absolute 19;
	rtClock2              : byte absolute 20;
	rndNumber             : byte absolute $D20A;

{$ifdef FAST}
	stop                  : word absolute $e0;
	i                     : word absolute $e0;
	r                     : word absolute $e2;
	x                     : word absolute $e4;
	y                     : word absolute $e6;
	bingo                 : word absolute $e8;
	probe                 : word absolute $ea;
	foundPi               : word absolute $ec;
	n                     : byte absolute $ee;
{$else}
	stop, i, r, x, y      : word;
	bingo, probe, foundPi : word;
	n                     : byte;
{$endif}

begin
	bingo := 0;
	r := 127 * 127;
	probe := 10000;

	Pause;	
	rtClock1 := 0; rtClock2 := 0;
	
	for i := 0 to probe do begin
		n := rndNumber and 127;
		x := n * n;
		n := rndNumber and 127;		
		y := n * n;
		if (x + y) <= r then Inc(bingo);
	end;
	
	foundPi := 4 * bingo;
	stop := (rtClock1 * 256) + rtClock2;
	
	{$ifdef FAST}
	WriteLn('Variables on zero page');
	{$endif}
	WriteLn('Probe size ', probe);
	WriteLn('Points in circle ', bingo);
	Write('Found pi approximation ', foundPi);
	WriteLn(chr(30),chr(30),chr(30),chr(30),chr(255),chr(44));
	WriteLn('Frames counter = ', stop);
	ReadKey;
end.

@tebe6502 explained:

MP extends the type for multiplication and other operations,
for u8xu8 it assumes the result of 16b, only later during optimization
it will start to reject redundant operations

result is written to eax, eax + 1 (16b)

procedure in the tests turned out to be several scanning lines faster than the one used previously:

.proc	imulCL

	lda #$00

	LDY #$09
	CLC
LOOP	ROR @
	ROR eax
	BCC MUL2
	CLC		;DEC AUX above to remove CLC
	ADC ecx
MUL2	DEY
	BNE LOOP

	STA eax+1

	RTS
.endp
zbyti commented

CC65 chose not optimal:

; 8x16 routine with external entry points used by the 16x16 routine in mul.s
tosmula0:
tosumula0:
        sta     ptr4
mul8x16:jsr     popptr1         ; Get left operand (Y=0 by popptr1)

        tya                     ; Clear byte 1
        ldy     #8              ; Number of bits
        ldx     ptr1+1          ; check if lhs is 8 bit only
        beq     mul8x8          ; Do 8x8 multiplication if high byte zero
mul8x16a:
        sta     ptr4+1          ; Clear byte 2

        lsr     ptr4            ; Get first bit into carry
@L0:    bcc     @L1

        clc
        adc     ptr1
        tax
        lda     ptr1+1          ; hi byte of left op
        adc     ptr4+1
        sta     ptr4+1
        txa

@L1:    ror     ptr4+1
        ror     a
        ror     ptr4
        dey
        bne     @L0
        tax
        lda     ptr4            ; Load the result
        rts
#include <stdio.h>
#include <peekpoke.h>

#define tick 0x14
#define tack 0x13
#define rndp 0xd20a

void main(void)
{
  register unsigned int stop, i, x, y;
  register unsigned int b, p, r;
  register unsigned char n;

  b = 0;
  r = 127 * 127;
  p = 10000;

  n = PEEK(tick);
  while (PEEK(tick) == n) { ; }

  printf("\nMonte-Carlo PI cc65 V2.18\n");

  asm(" lda #0");
  asm(" sta $13");
  asm(" sta $14");

  for (i = 0 ; i < p; ++i)
  {
    n = (PEEK(rndp) | 128) ^ 128;
    x = n * n;
    n = (PEEK(rndp) | 128) ^ 128;
    y = n * n;
    if ((x + y) <= r)
      ++b;
  }
  b = 4 * b;
  stop = PEEK(tick) + (PEEK(tack) * 256);
  printf("%d%c%c%c%c%c%c\n", b, 30, 30, 30, 30, 255, 46);
  printf("%d ticks\n", stop);

  infinite:
    goto infinite;
}

I expected:

; 8x8 multiplication routine

mul8x8:
        lsr     ptr4            ; Get first bit into carry
@L0:    bcc     @L1
        clc
        adc     ptr1
@L1:    ror
        ror     ptr4
        dey
        bne     @L0
        tax
        lda     ptr4            ; Load the result
        rts    

byte*byte produces a byte, this is by design. An arithmetic operator never promotes the result to a type larger that the type of its arguments.

In order to get a word, you need to explicitly cast one of the arguments to word: x = n*word(n)
This causes a call to __mul_u16u8u16, which is defined in m6502/zp_reg.mfk.
The same file also contains __mul_u16u16u16 and __mul_u8u8u8, plus all the division and modulo implementations.

zbyti commented

OK, thanks. I'll remember this concept.