From 401b704e73599a36bfdc8c778dab85f94d74ed1d Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 17 Nov 2017 12:14:52 -0800 Subject: Speed up Dhrystone on the HiFive1 There's a handful of things that went wrong here: * The read-only data sections were mapped to flash, which is very slow. I just put them in the data segment, so they end up in the scratchpad. This is about a 10x hit, so it's really important. * The toolchain was an old version, which didn't have a fast memcpy implementation on 32-bit systems. This is about a 2x hit. * Some compiler flags were incorrect, including * -Os instead of -O3 * Missing -mexplicit-relocs * Missing -DNOENUM * Missing -falign-functions=4 I haven't checked how much those hurt With this, I get $ make software BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone $ make upload BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone Execution starts, 10000000 runs through Dhrystone Execution ends Final values of the variables used in the benchmark: Int_Glob: 5 should be: 5 Bool_Glob: 1 should be: 1 Ch_1_Glob: A should be: A Ch_2_Glob: B should be: B Arr_1_Glob[8]: 7 should be: 7 Arr_2_Glob[8][7]: 10000010 should be: Number_Of_Runs + 10 Ptr_Glob-> Ptr_Comp: -2147470264 should be: (implementation-dependent) Discr: 0 should be: 0 Enum_Comp: 2 should be: 2 Int_Comp: 17 should be: 17 Str_Comp: DHRYSTONE PROGRAM, SOME STRING should be: DHRYSTONE PROGRAM, SOME STRING Next_Ptr_Glob-> Ptr_Comp: -2147470264 should be: (implementation-dependent), same as above Discr: 0 should be: 0 Enum_Comp: 1 should be: 1 Int_Comp: 18 should be: 18 Str_Comp: DHRYSTONE PROGRAM, SOME STRING should be: DHRYSTONE PROGRAM, SOME STRING Int_1_Loc: 5 should be: 5 Int_2_Loc: 13 should be: 13 Int_3_Loc: 7 should be: 7 Enum_Loc: 1 should be: 1 Str_1_Loc: DHRYSTONE PROGRAM, 1'ST STRING should be: DHRYSTONE PROGRAM, 1'ST STRING Str_2_Loc: DHRYSTONE PROGRAM, 2'ND STRING should be: DHRYSTONE PROGRAM, 2'ND STRING Microseconds for one run through Dhrystone: 1.3 Dhrystones per Second: 714285.6 which is 1.55 DMIPS/MHz at 262 MHz. It's still a bit slower than our current stuff, but I don't remember what was actually in the HiFive1 so I'm not sure what we should be getting. I verified the clock is accurate with a stopwatch. I haven't bothered to go look through the binary, but I think we're about 10 cycles off so it should be managable. --- bsp/env/freedom-e300-hifive1/dhrystone.lds | 157 +++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 bsp/env/freedom-e300-hifive1/dhrystone.lds (limited to 'bsp/env/freedom-e300-hifive1') diff --git a/bsp/env/freedom-e300-hifive1/dhrystone.lds b/bsp/env/freedom-e300-hifive1/dhrystone.lds new file mode 100644 index 0000000..cc9cd9b --- /dev/null +++ b/bsp/env/freedom-e300-hifive1/dhrystone.lds @@ -0,0 +1,157 @@ +OUTPUT_ARCH( "riscv" ) + +ENTRY( _start ) + +MEMORY +{ + flash (rxai!w) : ORIGIN = 0x20400000, LENGTH = 512M + ram (wxa!ri) : ORIGIN = 0x80000000, LENGTH = 16K +} + +PHDRS +{ + flash PT_LOAD; + ram_init PT_LOAD; + ram PT_NULL; +} + +SECTIONS +{ + __stack_size = DEFINED(__stack_size) ? __stack_size : 2K; + + .init : + { + KEEP (*(SORT_NONE(.init))) + } >flash AT>flash :flash + + .text : + { + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.*) + *(.gnu.linkonce.t.*) + } >flash AT>flash :flash + + .fini : + { + KEEP (*(SORT_NONE(.fini))) + } >flash AT>flash :flash + + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + + . = ALIGN(4); + + .preinit_array : + { + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + } >flash AT>flash :flash + + .init_array : + { + PROVIDE_HIDDEN (__init_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)) + PROVIDE_HIDDEN (__init_array_end = .); + } >flash AT>flash :flash + + .fini_array : + { + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors)) + PROVIDE_HIDDEN (__fini_array_end = .); + } >flash AT>flash :flash + + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin.o(.ctors)) + KEEP (*crtbegin?.o(.ctors)) + /* We don't want to include the .ctor section from + the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } >flash AT>flash :flash + + .dtors : + { + KEEP (*crtbegin.o(.dtors)) + KEEP (*crtbegin?.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } >flash AT>flash :flash + + .lalign : + { + . = ALIGN(4); + PROVIDE( _data_lma = . ); + } >flash AT>flash :flash + + .dalign : + { + . = ALIGN(4); + PROVIDE( _data = . ); + } >ram AT>flash :ram_init + + .data : + { + *(.rdata) + *(.rodata .rodata.*) + *(.gnu.linkonce.r.*) + *(.data .data.*) + *(.gnu.linkonce.d.*) + . = ALIGN(8); + PROVIDE( __global_pointer$ = . + 0x800 ); + *(.sdata .sdata.*) + *(.gnu.linkonce.s.*) + . = ALIGN(8); + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + } >ram AT>flash :ram_init + + . = ALIGN(4); + PROVIDE( _edata = . ); + PROVIDE( edata = . ); + + PROVIDE( _fbss = . ); + PROVIDE( __bss_start = . ); + .bss : + { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + . = ALIGN(4); + } >ram AT>ram :ram + + . = ALIGN(8); + PROVIDE( _end = . ); + PROVIDE( end = . ); + + .stack ORIGIN(ram) + LENGTH(ram) - __stack_size : + { + PROVIDE( _heap_end = . ); + . = __stack_size; + PROVIDE( _sp = . ); + } >ram AT>ram :ram +} -- cgit v1.2.3