diff options
| author | Palmer Dabbelt <palmer@dabbelt.com> | 2017-11-17 12:14:52 -0800 | 
|---|---|---|
| committer | Palmer Dabbelt <palmer@dabbelt.com> | 2017-11-17 15:24:01 -0800 | 
| commit | 401b704e73599a36bfdc8c778dab85f94d74ed1d (patch) | |
| tree | 5d4d4851f1a9ff03864ab87de11bacc05804ddbd /bsp | |
| parent | b53187a0434fe5e8dce288f55cfca36b292552e4 (diff) | |
Speed up Dhrystone on the HiFive1
There are a handful of things that went wrong here:
* The read-only data sections were mapped to flash, which is very slow.
  I just put them in the data segment, so they end up in the scratchpad.
  This is about a 10x hit, so it's really important.
* The toolchain was an old version, which didn't have a fast memcpy
  implementation on 32-bit systems.  This is about a 2x hit.
* Some compiler flags were incorrect, including
    * -Os instead of -O3
    * Missing -mexplicit-relocs
    * Missing -DNOENUM
    * Missing -falign-functions=4
  I haven't checked how much those hurt.
With this, I get
$ make software BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone
$ make upload BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone
Execution starts, 10000000 runs through Dhrystone
Execution ends
Final values of the variables used in the benchmark:
Int_Glob:            5
        should be:   5
Bool_Glob:           1
        should be:   1
Ch_1_Glob:           A
        should be:   A
Ch_2_Glob:           B
        should be:   B
Arr_1_Glob[8]:       7
        should be:   7
Arr_2_Glob[8][7]:    10000010
        should be:   Number_Of_Runs + 10
Ptr_Glob->
  Ptr_Comp:          -2147470264
        should be:   (implementation-dependent)
  Discr:             0
        should be:   0
  Enum_Comp:         2
        should be:   2
  Int_Comp:          17
        should be:   17
  Str_Comp:          DHRYSTONE PROGRAM, SOME STRING
        should be:   DHRYSTONE PROGRAM, SOME STRING
Next_Ptr_Glob->
  Ptr_Comp:          -2147470264
        should be:   (implementation-dependent), same as above
  Discr:             0
        should be:   0
  Enum_Comp:         1
        should be:   1
  Int_Comp:          18
        should be:   18
  Str_Comp:          DHRYSTONE PROGRAM, SOME STRING
        should be:   DHRYSTONE PROGRAM, SOME STRING
Int_1_Loc:           5
        should be:   5
Int_2_Loc:           13
        should be:   13
Int_3_Loc:           7
        should be:   7
Enum_Loc:            1
        should be:   1
Str_1_Loc:           DHRYSTONE PROGRAM, 1'ST STRING
        should be:   DHRYSTONE PROGRAM, 1'ST STRING
Str_2_Loc:           DHRYSTONE PROGRAM, 2'ND STRING
        should be:   DHRYSTONE PROGRAM, 2'ND STRING
Microseconds for one run through Dhrystone: 1.3
Dhrystones per Second:                      714285.6
which is 1.55 DMIPS/MHz at 262 MHz.  It's still a bit slower than our
current stuff, but I don't remember what was actually in the HiFive1 so
I'm not sure what we should be getting.  I verified the clock is
accurate with a stopwatch.  I haven't bothered to go look through the
binary, but I think we're about 10 cycles off so it should be manageable.
Diffstat (limited to 'bsp')
| -rw-r--r-- | bsp/env/freedom-e300-hifive1/dhrystone.lds | 157 | 
1 files changed, 157 insertions, 0 deletions
diff --git a/bsp/env/freedom-e300-hifive1/dhrystone.lds b/bsp/env/freedom-e300-hifive1/dhrystone.lds new file mode 100644 index 0000000..cc9cd9b --- /dev/null +++ b/bsp/env/freedom-e300-hifive1/dhrystone.lds @@ -0,0 +1,157 @@ +OUTPUT_ARCH( "riscv" ) + +ENTRY( _start ) + +MEMORY +{ +  flash (rxai!w) : ORIGIN = 0x20400000, LENGTH = 512M +  ram (wxa!ri) : ORIGIN = 0x80000000, LENGTH = 16K +} + +PHDRS +{ +  flash PT_LOAD; +  ram_init PT_LOAD; +  ram PT_NULL; +} + +SECTIONS +{ +  __stack_size = DEFINED(__stack_size) ? __stack_size : 2K; + +  .init           : +  { +    KEEP (*(SORT_NONE(.init))) +  } >flash AT>flash :flash + +  .text           : +  { +    *(.text.unlikely .text.unlikely.*) +    *(.text.startup .text.startup.*) +    *(.text .text.*) +    *(.gnu.linkonce.t.*) +  } >flash AT>flash :flash + +  .fini           : +  { +    KEEP (*(SORT_NONE(.fini))) +  } >flash AT>flash :flash + +  PROVIDE (__etext = .); +  PROVIDE (_etext = .); +  PROVIDE (etext = .); + +  . = ALIGN(4); + +  .preinit_array  : +  { +    PROVIDE_HIDDEN (__preinit_array_start = .); +    KEEP (*(.preinit_array)) +    PROVIDE_HIDDEN (__preinit_array_end = .); +  } >flash AT>flash :flash + +  .init_array     : +  { +    PROVIDE_HIDDEN (__init_array_start = .); +    KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) +    KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)) +    PROVIDE_HIDDEN (__init_array_end = .); +  } >flash AT>flash :flash + +  .fini_array     : +  { +    PROVIDE_HIDDEN (__fini_array_start = .); +    KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) +    KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors)) +    PROVIDE_HIDDEN (__fini_array_end = .); +  } >flash AT>flash :flash + +  .ctors          : +  { +    /* gcc uses crtbegin.o to find the start of +       the constructors, so we make sure it is +       first.  
Because this is a wildcard, it +       doesn't matter if the user does not +       actually link against crtbegin.o; the +       linker won't look for a file to match a +       wildcard.  The wildcard also means that it +       doesn't matter which directory crtbegin.o +       is in.  */ +    KEEP (*crtbegin.o(.ctors)) +    KEEP (*crtbegin?.o(.ctors)) +    /* We don't want to include the .ctor section from +       the crtend.o file until after the sorted ctors. +       The .ctor section from the crtend file contains the +       end of ctors marker and it must be last */ +    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors)) +    KEEP (*(SORT(.ctors.*))) +    KEEP (*(.ctors)) +  } >flash AT>flash :flash + +  .dtors          : +  { +    KEEP (*crtbegin.o(.dtors)) +    KEEP (*crtbegin?.o(.dtors)) +    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors)) +    KEEP (*(SORT(.dtors.*))) +    KEEP (*(.dtors)) +  } >flash AT>flash :flash + +  .lalign         : +  { +    . = ALIGN(4); +    PROVIDE( _data_lma = . ); +  } >flash AT>flash :flash + +  .dalign         : +  { +    . = ALIGN(4); +    PROVIDE( _data = . ); +  } >ram AT>flash :ram_init + +  .data          : +  { +    *(.rdata) +    *(.rodata .rodata.*) +    *(.gnu.linkonce.r.*) +    *(.data .data.*) +    *(.gnu.linkonce.d.*) +    . = ALIGN(8); +    PROVIDE( __global_pointer$ = . + 0x800 ); +    *(.sdata .sdata.*) +    *(.gnu.linkonce.s.*) +    . = ALIGN(8); +    *(.srodata.cst16) +    *(.srodata.cst8) +    *(.srodata.cst4) +    *(.srodata.cst2) +    *(.srodata .srodata.*) +  } >ram AT>flash :ram_init + +  . = ALIGN(4); +  PROVIDE( _edata = . ); +  PROVIDE( edata = . ); + +  PROVIDE( _fbss = . ); +  PROVIDE( __bss_start = . ); +  .bss            : +  { +    *(.sbss*) +    *(.gnu.linkonce.sb.*) +    *(.bss .bss.*) +    *(.gnu.linkonce.b.*) +    *(COMMON) +    . = ALIGN(4); +  } >ram AT>ram :ram + +  . = ALIGN(8); +  PROVIDE( _end = . ); +  PROVIDE( end = . 
); + +  .stack ORIGIN(ram) + LENGTH(ram) - __stack_size : +  { +    PROVIDE( _heap_end = . ); +    . = __stack_size; +    PROVIDE( _sp = . ); +  } >ram AT>ram :ram +}  | 
