foss-for-synopsys-dwc-arc-processors/toolchain

Busybox's hexdump and xxd tools don't work on HS58 if compiled with toolchain 2023.09

Closed this issue · 8 comments

The tools don't print anything if compiled with the ARC 2023.09 toolchain. Both tools seem to share the same library (libbb) for dumping binary data. Interestingly, the "od" tool works fine.

Arch: HS58
Toolchain: 2023.09 (glibc or uclibc)

The easiest method for reproduction is to:

  1. Use branch https://github.com/foss-for-synopsys-dwc-arc-processors/buildroot/tree/arc-2023.09 and defconfig: snps_arc32_defconfig
  2. Compile the distro
  3. Run vmlinux under nSIM
  4. Try to dump any file using hexdump - nothing will be printed

This could be something related to optimization (either in the toolchain or in libbb itself).
Adding #pragma GCC optimize ("O0") to the file "output/build/busybox-1.36.1/libbb/dump.c" makes the hexdump/xxd work again.

The functionality was OK with toolchain 2022.09.

Hi @gmazurkiewicz

The reason hexdump prints nothing is that the compiled code does not save the result of the 'mac' instruction. Below is the relevant piece of code and its disassembly, annotated with my explanations.

---hexdump print out fails---
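The instruction at address 7c0dc shows that the result of "cur_size += fu->bcnt * fu->reps" is lost: the `mac` destination is 0, so the sum stays only in the accumulator (r58), and on the branch-taken path (7c0d8 -> 7c0f2 -> 7c0b0) nothing copies it back to r19 (cur_size).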

/*
 * Excerpt of bb_dump_size() from busybox libbb/dump.c as quoted in this
 * report; the body of the inner format-scanning loop is elided ("....").
 *
 * Walks the format units chained from fs->nextfu and returns the sum of
 * their data block sizes in cur_size. This is the variant that is
 * miscompiled: the accumulation on the `continue` path is the statement
 * whose result is lost in the disassembly that follows.
 */
static NOINLINE int bb_dump_size(FS *fs)
{
	FU *fu;
	int bcnt; 
	int cur_size;
	char *fmt;
	const char *p;
	int prec;

	/* figure out the data block size needed for each format unit */
	for (cur_size = 0, fu = fs->nextfu; fu; fu = fu->nextfu) {
		if (fu->bcnt) {
			/* Non-zero fu->bcnt: add bcnt * reps and skip the format scan.
			 * This multiply-accumulate is the one emitted as `mac 0,r18,r20`. */
			cur_size += fu->bcnt * fu->reps;
			continue;
		}
		/* fu->bcnt is zero: bcnt and prec restart at 0 and fu->fmt is
		 * scanned character by character (loop body elided in the report;
		 * presumably it derives bcnt from the format string). */
		for (bcnt = prec = 0, fmt = fu->fmt; *fmt; ++fmt) {
....
		}
		/* Add the byte count produced by the scan, times the repetitions. */
		cur_size += bcnt * fu->reps;
	}
	return cur_size;
}


0007c098 <bb_dump_size.isra.0>:
   7c098:	1cf8 b50e           	std.aw	r20r21,[sp,-8]
   7c09c:	1cf8 b48e           	std.aw	r18r19,[sp,-8]
   7c0a0:	1cf8 b40e           	std.aw	r16r17,[sp,-8]
   7c0a4:	1cf8 b38e           	std.aw	r14r15,[sp,-8]
   7c0a8:	1cfc b7c8           	st.aw	blink,[sp,-4]
   7c0ac:	4010                	mov_s	r16,r0				--> fu = fs->nextfu
   7c0ae:	706e                	mov_s	r19,0				--> r19 = cur_size = 0
   
   7c0b0:	081d 2011           	brne	r16,0,28	;7c0cc		--> fu != 0
   7c0b4:	1404 341f           	ld.ab	blink,[sp,4]
   7c0b8:	4062                	mov_s	r0,r19				--> return cur_size;
   7c0ba:	1408 358e           	ldd.ab	r14r15,[sp,8]
   7c0be:	1408 3590           	ldd.ab	r16r17,[sp,8]
   7c0c2:	1408 3592           	ldd.ab	r18r19,[sp,8]
   7c0c6:	7fe0                	j_s.d	[blink]				--> exit from bb_dump_size
   7c0c8:	1408 3594           	ldd.ab	r20r21,[sp,8]
   
   7c0cc:	1004 2612           	ld.as	r18,[r16,4]			--> r18 = fu->bcnt
   7c0d0:	220a 74c0           	mov	r58,r19					--> acc = cur_size
   7c0d4:	1003 2614           	ld.as	r20,[r16,3]			--> r20 = fu->reps
   7c0d8:	0a1b 2031           	brne.d	r18,0,26	;7c0f2		--> if (fu->bcnt != 0)
   7c0dc:	2a0e 253e           	mac	0,r18,r20			--> acc = cur_size + r18 * r20
   ......
   7c0f2:	07c1 ffef           	b.d	-64	;7c0b0			--> continue;
   7c0f6:	1000 2010           	ld	r16,[r16]			--> fu = fu->nextfu

If I add volatile to the cur_size variable in the source code, then we see that reads from and writes to cur_size go through the stack, and this makes the issue go away.

---hexdump print works---
At address 7c0f0 we see that the result of the multiply-accumulate instruction is saved to the cur_size variable.

/*
 * Same excerpt of bb_dump_size() (busybox libbb/dump.c, inner loop body
 * elided as "....") with one change: cur_size is declared volatile.
 *
 * Walks the format units chained from fs->nextfu and returns the sum of
 * their data block sizes. With this variant hexdump prints correctly.
 */
static NOINLINE int bb_dump_size(FS *fs)
{
	FU *fu;
	int bcnt;
	/* volatile is the experiment from this investigation: it makes every
	 * read and write of cur_size go through the stack, so the accumulator
	 * value is stored back after `mac` (see the `st r58,[sp]` at 7c0f0). */
	volatile int cur_size;
	char *fmt;
	const char *p;
	int prec;

	/* figure out the data block size needed for each format unit */
	for (cur_size = 0, fu = fs->nextfu; fu; fu = fu->nextfu) {
		if (fu->bcnt) {
			/* Non-zero fu->bcnt: add bcnt * reps and skip the format scan. */
			cur_size += fu->bcnt * fu->reps;
			continue;
		}
		/* fu->bcnt is zero: bcnt and prec restart at 0 and fu->fmt is
		 * scanned character by character (loop body elided in the report;
		 * presumably it derives bcnt from the format string). */
		for (bcnt = prec = 0, fmt = fu->fmt; *fmt; ++fmt) {
....
		}
		/* Add the byte count produced by the scan, times the repetitions. */
		cur_size += bcnt * fu->reps;
	}
	return cur_size;
}


0007c098 <bb_dump_size.isra.0>:
   7c098:	1cfc b508           	st.aw	r20,[sp,-4]
   7c09c:	1cf8 b48e           	std.aw	r18r19,[sp,-8]
   7c0a0:	1cf8 b40e           	std.aw	r16r17,[sp,-8]
   7c0a4:	1cf8 b38e           	std.aw	r14r15,[sp,-8]
   7c0a8:	1cfc b7c8           	st.aw	blink,[sp,-4]
   7c0ac:	2458 305c           	sub2	sp,sp,0x1
   7c0b0:	4110                	mov_s	r17,r0				--> fu = fs->nextfu
   7c0b2:	1c00 3001           	st	0,[sp]				--> cur_size = 0
   
   7c0b6:	0925 2011           	brne	r17,0,36	;7c0d8		--> fu != 0
   7c0ba:	1400 3000           	ld	r0,[sp]				--> return cur_size;
   7c0be:	2455 305c           	add2	sp,sp,0x1
   7c0c2:	1404 341f           	ld.ab	blink,[sp,4]
   7c0c6:	1408 358e           	ldd.ab	r14r15,[sp,8]
   7c0ca:	1408 3590           	ldd.ab	r16r17,[sp,8]
   7c0ce:	1408 3592           	ldd.ab	r18r19,[sp,8]
   7c0d2:	7fe0                	j_s.d	[blink]				--> exit from bb_dump_size
   7c0d4:	1404 3414           	ld.ab	r20,[sp,4]
   
   7c0d8:	1104 2612           	ld.as	r18,[r17,4]			--> r18 = fu->bcnt
   7c0dc:	0a21 2030           	breq.d	r18,0,32	;7c0fc		--> if (fu->bcnt == 0)
   7c0e0:	1103 2613           	ld.as	r19,[r17,3]			--> r19 = fu->reps
   7c0e4:	1400 3000           	ld	r0,[sp]				--> r0 = cur_size
   7c0e8:	220a 7000           	mov	r58,r0				--> acc = r0 = cur_size
   7c0ec:	2a0e 24fe           	mac	0,r18,r19			--> acc = cur_size + fu->bcnt * fu->reps
   7c0f0:	1c00 3e80           	st	r58,[sp]			--> cur_size = acc
   7c0f4:	07c3 ffef           	b.d	-62	;7c0b6			--> continue;
   7c0f8:	1100 2011           	ld	r17,[r17]			--> fu = fu->nextfu
   ......
   7c0fc:	1105 260e           	ld.as	r14,[r17,5]
   .....

GCC: 12.2.1

I will give more details on how to compile the sources and reproduce it.

arc32-buildroot-linux-gnu-gcc --version
arc32-buildroot-linux-gnu-gcc.br_real (Buildroot 2021.11-1818-ge936f33b55-dirty) 12.2.1 20230306

libbb.zip

Thanks for the investigation. What is the plan for this? Is it going to be fixed in the toolchain?

@claziss I reduced the code to this:

/*
 * Reduced reproducer for the lost multiply-accumulate result.
 *
 * Takes no arguments and returns the accumulated cur_size. Evaluated by
 * hand from the loops below, the correct return value is 1210:
 *   i = 10..6: 100 + 81 + 64 + 49 + 36           = 330
 *   i = 5..1 : (i + 55) * i = 300+236+174+114+56 = 880
 *
 * The statement shape (an accumulate followed by `continue`, then a second
 * accumulate at the end of the loop body) mirrors bb_dump_size(); keep it
 * as is, since the report relies on this exact form to trigger the bug.
 */
int bug()
{
        int bcnt;
        int cur_size = 0;
        int i, j;

        /* Outer loop counts i down from 10 to 1. */
        for (i = 10; i > 0; i -= 1) {
                bcnt = i;

                if (i > 5) {
                        /* Accumulate-and-continue path: in the -Os output this
                         * is the `mac 0,r1,r1` whose result is not written
                         * back to the register holding cur_size. */
                        cur_size += i * i;
                        continue;
                }

                /* Adds 10 + 9 + ... + 1 = 55 to bcnt, so bcnt == i + 55. */
                for (j = 10; j > 0; j -= 1) {
                         bcnt += j;
                }

                /* Second accumulate; emitted as `mac r0,r3,r1`, which does
                 * write its result to r0. */
                cur_size += bcnt * i;
        }

        return cur_size;
}

Save as main.c, compile using arc-2023.09 release and disassemble:

$ arc32-linux-uclibc-gcc -Os -g3 -c main.c
$ arc32-linux-uclibc-objdump -dS main.o

main.o:     file format elf32-littlearc64


Disassembly of section .text:

00000000 <bug>:
{
        int bcnt;
        int cur_size = 0;
        int i, j;

        for (i = 10; i > 0; i -= 1) {
   0:   d90a                    mov_s   r1,0xa
        int cur_size = 0;
   2:   700c                    mov_s   r0,0
   4:   da0a                    mov_s   r2,0xa
                bcnt = i;

                if (i > 5) {
                        cur_size += i * i;
   6:   220a 7000               mov     r58,r0
                if (i > 5) {
   a:   0927 01b3               brge.d  r1,0x6,38       ;2e <bug+0x2e>
   e:   290e 007e               mac     0,r1,r1 ; <-------------- Here
                bcnt = i;
  12:   4320                    mov_s   r3,r1
                        continue;
                }

                for (j = 10; j > 0; j -= 1) {
  14:   254a 0280               mov     r5,0xa
  18:   244a 0280               mov     r4,0xa
                         bcnt += j;
  1c:   73a0                    add_s   r3,r3,r5
                for (j = 10; j > 0; j -= 1) {
  1e:   248d 0000               dbnz.d  r4,0    ;1c <bug+0x1c>
  22:   2542 0045               sub     r5,r5,0x1
                }

                cur_size += bcnt * i;
  26:   220a 7000               mov     r58,r0
  2a:   2b0e 0040               mac     r0,r3,r1
        for (i = 10; i > 0; i -= 1) {
  2e:   228d 0b7f               dbnz.d  r2,-38  ;6 <bug+0x6>
  32:   2142 0041               sub     r1,r1,0x1
        }

        return cur_size;
}
  36:   7ee0                    j_s     [blink]

Seems like the problem is incorrect register allocation. GCC produces the same bug for the HS6x target too.

I suppose that it can be fixed here: https://github.com/foss-for-synopsys-dwc-arc-processors/gcc/blob/arc-2023.09/gcc/config/arc64/arith.md#L1139. Looks like "mac\\t0,%0,%1" must be replaced by "mac\\t%0,%0,%1". I will check it and create a patch.

Seems like that's not the case. "mac\t0,%0,%1" is a valid one.

Fix in:
foss-for-synopsys-dwc-arc-processors/gcc@aaef101

Thanks @kolerov for narrowing it down.

Thanks, I can confirm that the fix in the toolchain resolved the issue.