byte multiplication problem
zbyti opened this issue · 5 comments
const word probe = 9999
const word radius = 127 * 127
pointer screen @ $80
asm void pause() {
lda os_RTCLOK.b2
.rt_check:
cmp os_RTCLOK.b2
beq .rt_check
rts
}
// print in HEX
void printScore(word val) {
array(byte) tmp[4]
byte iter
tmp[0] = val.hi >> 4
tmp[1] = val.hi & %00001111
tmp[2] = val.lo >> 4
tmp[3] = val.lo & %00001111
for iter:tmp {
if tmp[iter] < 10 {
screen[iter] = tmp[iter] + $10
} else {
screen[iter] = tmp[iter] + $17
}
}
screen += 40
}
void main() {
array(bool) flags[size] align(1024)
word i@$e0, bingo@$e2
word x@$e4, y@$e6, p@$e8, t@$ea
byte n@$82
screen = os_SAVMSC
x = 0
y = 0
bingo = 0
pause()
os_RTCLOK = 0
for i,0,to,probe {
n = pokey_random & 127
x = n * n
n = pokey_random & 127
y = n * n
if ((x + y) <= radius) {
bingo += 1
}
}
p = 4 * bingo
t = os_RTCLOK.b2 + (os_RTCLOK.b1 * 256)
printScore(t)
printScore(p)
while true {}
}
If n
is word
everything is OK but byte
produce wrong score, n * n
is not promoted to word
.
I am aware of the difference in single results, but what interests me is the multiplication procedure implemented in a given language, then you can see a significant difference.
In this benchmark Mad Pascal used:
/*
;
; Ullrich von Bassewitz, 2009-08-17
;
; CC65 runtime: 8x8 => 16 unsigned multiplication
;
*/
.proc imulCL
ptr1 = ecx
ptr4 = eax
ldy #8
lda #0
lsr ptr4 ; Get first bit into carry
@L0: bcc @L1
clc
adc ptr1
@L1: ror @
ror ptr4
dey
bne @L0
sta ptr4+1
rts
.endp
program MonteCarloPi;
uses crt;
{$define FAST}
var
rtClock1 : byte absolute 19;
rtClock2 : byte absolute 20;
rndNumber : byte absolute $D20A;
{$ifdef FAST}
stop : word absolute $e0;
i : word absolute $e0;
r : word absolute $e2;
x : word absolute $e4;
y : word absolute $e6;
bingo : word absolute $e8;
probe : word absolute $ea;
foundPi : word absolute $ec;
n : byte absolute $ee;
{$else}
stop, i, r, x, y : word;
bingo, probe, foundPi : word;
n : byte;
{$endif}
begin
bingo := 0;
r := 127 * 127;
probe := 10000;
Pause;
rtClock1 := 0; rtClock2 := 0;
for i := 0 to probe do begin
n := rndNumber and 127;
x := n * n;
n := rndNumber and 127;
y := n * n;
if (x + y) <= r then Inc(bingo);
end;
foundPi := 4 * bingo;
stop := (rtClock1 * 256) + rtClock2;
{$ifdef FAST}
WriteLn('Variables on zero page');
{$endif}
WriteLn('Probe size ', probe);
WriteLn('Points in circle ', bingo);
Write('Found pi approximation ', foundPi);
WriteLn(chr(30),chr(30),chr(30),chr(30),chr(255),chr(44));
WriteLn('Frames counter = ', stop);
ReadKey;
end.
@tebe6502 explained:
MP extends the type for multiplication and other operations,
for u8xu8 it assumes the result of 16b, only later during optimization
it will start to reject redundant operations
result is written to eax, eax + 1 (16b)
procedure in the tests turned out to be several scanning lines faster than the one used previously:
.proc imulCL
lda #$00
LDY #$09
CLC
LOOP ROR @
ROR eax
BCC MUL2
CLC ;DEC AUX above to remove CLC
ADC ecx
MUL2 DEY
BNE LOOP
STA eax+1
RTS
.endp
CC65 chose not optimal:
; 8x16 routine with external entry points used by the 16x16 routine in mul.s
tosmula0:
tosumula0:
sta ptr4
mul8x16:jsr popptr1 ; Get left operand (Y=0 by popptr1)
tya ; Clear byte 1
ldy #8 ; Number of bits
ldx ptr1+1 ; check if lhs is 8 bit only
beq mul8x8 ; Do 8x8 multiplication if high byte zero
mul8x16a:
sta ptr4+1 ; Clear byte 2
lsr ptr4 ; Get first bit into carry
@L0: bcc @L1
clc
adc ptr1
tax
lda ptr1+1 ; hi byte of left op
adc ptr4+1
sta ptr4+1
txa
@L1: ror ptr4+1
ror a
ror ptr4
dey
bne @L0
tax
lda ptr4 ; Load the result
rts
#include <stdio.h>
#include <peekpoke.h>
#define tick 0x14
#define tack 0x13
#define rndp 0xd20a
void main(void)
{
register unsigned int stop, i, x, y;
register unsigned int b, p, r;
register unsigned char n;
b = 0;
r = 127 * 127;
p = 10000;
n = PEEK(tick);
while (PEEK(tick) == n) { ; }
printf("\nMonte-Carlo PI cc65 V2.18\n");
asm(" lda #0");
asm(" sta $13");
asm(" sta $14");
for (i = 0 ; i < p; ++i)
{
n = (PEEK(rndp) | 128) ^ 128;
x = n * n;
n = (PEEK(rndp) | 128) ^ 128;
y = n * n;
if ((x + y) <= r)
++b;
}
b = 4 * b;
stop = PEEK(tick) + (PEEK(tack) * 256);
printf("%d%c%c%c%c%c%c\n", b, 30, 30, 30, 30, 255, 46);
printf("%d ticks\n", stop);
infinite:
goto infinite;
}
I expected:
; 8x8 multiplication routine
mul8x8:
lsr ptr4 ; Get first bit into carry
@L0: bcc @L1
clc
adc ptr1
@L1: ror
ror ptr4
dey
bne @L0
tax
lda ptr4 ; Load the result
rts
byte*byte
produces a byte
, this is by design. An arithmetic operator never promotes the result to a type larger that the type of its arguments.
In order to get a word
, you need to explicitly cast one of the arguments to word
: x = n*word(n)
This causes a call to __mul_u16u8u16
, which is defined in m6502/zp_reg.mfk
.
The same file also contains __mul_u16u16u16
and __mul_u8u8u8
, plus all the division and modulo implementations.
OK, thanks. I'll remember this concept.