summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPalmer Dabbelt <palmer@dabbelt.com>2017-11-17 12:14:52 -0800
committerPalmer Dabbelt <palmer@dabbelt.com>2017-11-17 15:24:01 -0800
commit401b704e73599a36bfdc8c778dab85f94d74ed1d (patch)
tree5d4d4851f1a9ff03864ab87de11bacc05804ddbd
parentb53187a0434fe5e8dce288f55cfca36b292552e4 (diff)
Speed up Dhrystone on the HiFive1
There's a handful of things that went wrong here: * The read-only data sections were mapped to flash, which is very slow. I just put them in the data segment, so they end up in the scratchpad. This is about a 10x hit, so it's really important. * The toolchain was an old version, which didn't have a fast memcpy implementation on 32-bit systems. This is about a 2x hit. * Some compiler flags were incorrect, including * -Os instead of -O3 * Missing -mexplicit-relocs * Missing -DNOENUM * Missing -falign-functions=4 I haven't checked how much those hurt With this, I get $ make software BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone $ make upload BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone Execution starts, 10000000 runs through Dhrystone Execution ends Final values of the variables used in the benchmark: Int_Glob: 5 should be: 5 Bool_Glob: 1 should be: 1 Ch_1_Glob: A should be: A Ch_2_Glob: B should be: B Arr_1_Glob[8]: 7 should be: 7 Arr_2_Glob[8][7]: 10000010 should be: Number_Of_Runs + 10 Ptr_Glob-> Ptr_Comp: -2147470264 should be: (implementation-dependent) Discr: 0 should be: 0 Enum_Comp: 2 should be: 2 Int_Comp: 17 should be: 17 Str_Comp: DHRYSTONE PROGRAM, SOME STRING should be: DHRYSTONE PROGRAM, SOME STRING Next_Ptr_Glob-> Ptr_Comp: -2147470264 should be: (implementation-dependent), same as above Discr: 0 should be: 0 Enum_Comp: 1 should be: 1 Int_Comp: 18 should be: 18 Str_Comp: DHRYSTONE PROGRAM, SOME STRING should be: DHRYSTONE PROGRAM, SOME STRING Int_1_Loc: 5 should be: 5 Int_2_Loc: 13 should be: 13 Int_3_Loc: 7 should be: 7 Enum_Loc: 1 should be: 1 Str_1_Loc: DHRYSTONE PROGRAM, 1'ST STRING should be: DHRYSTONE PROGRAM, 1'ST STRING Str_2_Loc: DHRYSTONE PROGRAM, 2'ND STRING should be: DHRYSTONE PROGRAM, 2'ND STRING Microseconds for one run through Dhrystone: 1.3 Dhrystones per Second: 714285.6 which is 1.55 DMIPS/MHz at 262 MHz. It's still a bit slower than our current stuff, but I don't remember what was actually in the HiFive1 so I'm not sure what we should be getting. I verified the clock is accurate with a stopwatch. I haven't bothered to go look through the binary, but I think we're about 10 cycles off so it should be managable.
-rw-r--r--bsp/env/freedom-e300-hifive1/dhrystone.lds157
m---------riscv-gnu-toolchain0
-rw-r--r--software/dhrystone/Makefile4
3 files changed, 159 insertions, 2 deletions
diff --git a/bsp/env/freedom-e300-hifive1/dhrystone.lds b/bsp/env/freedom-e300-hifive1/dhrystone.lds
new file mode 100644
index 0000000..cc9cd9b
--- /dev/null
+++ b/bsp/env/freedom-e300-hifive1/dhrystone.lds
@@ -0,0 +1,157 @@
+OUTPUT_ARCH( "riscv" )
+
+ENTRY( _start )
+
+MEMORY
+{
+ flash (rxai!w) : ORIGIN = 0x20400000, LENGTH = 512M
+ ram (wxa!ri) : ORIGIN = 0x80000000, LENGTH = 16K
+}
+
+PHDRS
+{
+ flash PT_LOAD;
+ ram_init PT_LOAD;
+ ram PT_NULL;
+}
+
+SECTIONS
+{
+ __stack_size = DEFINED(__stack_size) ? __stack_size : 2K;
+
+ .init :
+ {
+ KEEP (*(SORT_NONE(.init)))
+ } >flash AT>flash :flash
+
+ .text :
+ {
+ *(.text.unlikely .text.unlikely.*)
+ *(.text.startup .text.startup.*)
+ *(.text .text.*)
+ *(.gnu.linkonce.t.*)
+ } >flash AT>flash :flash
+
+ .fini :
+ {
+ KEEP (*(SORT_NONE(.fini)))
+ } >flash AT>flash :flash
+
+ PROVIDE (__etext = .);
+ PROVIDE (_etext = .);
+ PROVIDE (etext = .);
+
+ . = ALIGN(4);
+
+ .preinit_array :
+ {
+ PROVIDE_HIDDEN (__preinit_array_start = .);
+ KEEP (*(.preinit_array))
+ PROVIDE_HIDDEN (__preinit_array_end = .);
+ } >flash AT>flash :flash
+
+ .init_array :
+ {
+ PROVIDE_HIDDEN (__init_array_start = .);
+ KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
+ KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
+ PROVIDE_HIDDEN (__init_array_end = .);
+ } >flash AT>flash :flash
+
+ .fini_array :
+ {
+ PROVIDE_HIDDEN (__fini_array_start = .);
+ KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
+ KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
+ PROVIDE_HIDDEN (__fini_array_end = .);
+ } >flash AT>flash :flash
+
+ .ctors :
+ {
+ /* gcc uses crtbegin.o to find the start of
+ the constructors, so we make sure it is
+ first. Because this is a wildcard, it
+ doesn't matter if the user does not
+ actually link against crtbegin.o; the
+ linker won't look for a file to match a
+ wildcard. The wildcard also means that it
+ doesn't matter which directory crtbegin.o
+ is in. */
+ KEEP (*crtbegin.o(.ctors))
+ KEEP (*crtbegin?.o(.ctors))
+ /* We don't want to include the .ctor section from
+ the crtend.o file until after the sorted ctors.
+ The .ctor section from the crtend file contains the
+ end of ctors marker and it must be last */
+ KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
+ KEEP (*(SORT(.ctors.*)))
+ KEEP (*(.ctors))
+ } >flash AT>flash :flash
+
+ .dtors :
+ {
+ KEEP (*crtbegin.o(.dtors))
+ KEEP (*crtbegin?.o(.dtors))
+ KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
+ KEEP (*(SORT(.dtors.*)))
+ KEEP (*(.dtors))
+ } >flash AT>flash :flash
+
+ .lalign :
+ {
+ . = ALIGN(4);
+ PROVIDE( _data_lma = . );
+ } >flash AT>flash :flash
+
+ .dalign :
+ {
+ . = ALIGN(4);
+ PROVIDE( _data = . );
+ } >ram AT>flash :ram_init
+
+ .data :
+ {
+ *(.rdata)
+ *(.rodata .rodata.*)
+ *(.gnu.linkonce.r.*)
+ *(.data .data.*)
+ *(.gnu.linkonce.d.*)
+ . = ALIGN(8);
+ PROVIDE( __global_pointer$ = . + 0x800 );
+ *(.sdata .sdata.*)
+ *(.gnu.linkonce.s.*)
+ . = ALIGN(8);
+ *(.srodata.cst16)
+ *(.srodata.cst8)
+ *(.srodata.cst4)
+ *(.srodata.cst2)
+ *(.srodata .srodata.*)
+ } >ram AT>flash :ram_init
+
+ . = ALIGN(4);
+ PROVIDE( _edata = . );
+ PROVIDE( edata = . );
+
+ PROVIDE( _fbss = . );
+ PROVIDE( __bss_start = . );
+ .bss :
+ {
+ *(.sbss*)
+ *(.gnu.linkonce.sb.*)
+ *(.bss .bss.*)
+ *(.gnu.linkonce.b.*)
+ *(COMMON)
+ . = ALIGN(4);
+ } >ram AT>ram :ram
+
+ . = ALIGN(8);
+ PROVIDE( _end = . );
+ PROVIDE( end = . );
+
+ .stack ORIGIN(ram) + LENGTH(ram) - __stack_size :
+ {
+ PROVIDE( _heap_end = . );
+ . = __stack_size;
+ PROVIDE( _sp = . );
+ } >ram AT>ram :ram
+}
diff --git a/riscv-gnu-toolchain b/riscv-gnu-toolchain
-Subproject 65cb174d37e2efe99e094eb317113442c79ed3c
+Subproject bf5697a1a6509705b50dcc1f67b8c620a7b21ec
diff --git a/software/dhrystone/Makefile b/software/dhrystone/Makefile
index d401720..4602653 100644
--- a/software/dhrystone/Makefile
+++ b/software/dhrystone/Makefile
@@ -5,10 +5,10 @@ C_SRCS := dhry_stubs.c dhry_printf.c
HEADERS := dhry.h
DHRY_SRCS := dhry_1.c dhry_2.c
-DHRY_CFLAGS := -O2 -DTIME -fno-inline -fno-builtin-printf -Wno-implicit -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
+DHRY_CFLAGS := -O3 -DTIME -fno-inline -fno-builtin-printf -Wno-implicit -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
XLEN ?= 32
-CFLAGS := -Os -fno-common -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
+CFLAGS := -O3 -fno-common -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -mexplicit-relocs -DNOENUM -falign-functions=4
LDFLAGS := -Wl,--wrap=scanf -Wl,--wrap=printf -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -mcmodel=medany
DHRY_OBJS := $(patsubst %.c,%.o,$(DHRY_SRCS))