From 401b704e73599a36bfdc8c778dab85f94d74ed1d Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Fri, 17 Nov 2017 12:14:52 -0800
Subject: Speed up Dhrystone on the HiFive1

There were a handful of things wrong here:

* The read-only data sections were mapped to flash, which is very slow.
  I just put them in the data segment, so they end up in the scratchpad.
  This is about a 10x hit, so it's really important.
* The toolchain was an old version, which didn't have a fast memcpy
  implementation on 32-bit systems.  This is about a 2x hit.
* Some compiler flags were incorrect, including
    * -Os instead of -O3
    * Missing -mexplicit-relocs
    * Missing -DNOENUM
    * Missing -falign-functions=4
  I haven't checked how much each of those hurt.

With this, I get

$ make software BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone
$ make upload BOARD=freedom-e300-hifive1 PROGRAM=dhrystone LINK_TARGET=dhrystone
Execution starts, 10000000 runs through Dhrystone
Execution ends

Final values of the variables used in the benchmark:

Int_Glob:            5
        should be:   5
Bool_Glob:           1
        should be:   1
Ch_1_Glob:           A
        should be:   A
Ch_2_Glob:           B
        should be:   B
Arr_1_Glob[8]:       7
        should be:   7
Arr_2_Glob[8][7]:    10000010
        should be:   Number_Of_Runs + 10
Ptr_Glob->
  Ptr_Comp:          -2147470264
        should be:   (implementation-dependent)
  Discr:             0
        should be:   0
  Enum_Comp:         2
        should be:   2
  Int_Comp:          17
        should be:   17
  Str_Comp:          DHRYSTONE PROGRAM, SOME STRING
        should be:   DHRYSTONE PROGRAM, SOME STRING
Next_Ptr_Glob->
  Ptr_Comp:          -2147470264
        should be:   (implementation-dependent), same as above
  Discr:             0
        should be:   0
  Enum_Comp:         1
        should be:   1
  Int_Comp:          18
        should be:   18
  Str_Comp:          DHRYSTONE PROGRAM, SOME STRING
        should be:   DHRYSTONE PROGRAM, SOME STRING
Int_1_Loc:           5
        should be:   5
Int_2_Loc:           13
        should be:   13
Int_3_Loc:           7
        should be:   7
Enum_Loc:            1
        should be:   1
Str_1_Loc:           DHRYSTONE PROGRAM, 1'ST STRING
        should be:   DHRYSTONE PROGRAM, 1'ST STRING
Str_2_Loc:           DHRYSTONE PROGRAM, 2'ND STRING
        should be:   DHRYSTONE PROGRAM, 2'ND STRING

Microseconds for one run through Dhrystone: 1.3
Dhrystones per Second:                      714285.6

which is 1.55 DMIPS/MHz at 262 MHz.  It's still a bit slower than our
current stuff, but I don't remember what was actually in the HiFive1 so
I'm not sure what we should be getting.  I verified the clock is
accurate with a stopwatch.  I haven't bothered to go look through the
binary, but I think we're about 10 cycles off, so it should be manageable.
---
 bsp/env/freedom-e300-hifive1/dhrystone.lds | 157 +++++++++++++++++++++++++++++
 riscv-gnu-toolchain                        |   2 +-
 software/dhrystone/Makefile                |   4 +-
 3 files changed, 160 insertions(+), 3 deletions(-)
 create mode 100644 bsp/env/freedom-e300-hifive1/dhrystone.lds

diff --git a/bsp/env/freedom-e300-hifive1/dhrystone.lds b/bsp/env/freedom-e300-hifive1/dhrystone.lds
new file mode 100644
index 0000000..cc9cd9b
--- /dev/null
+++ b/bsp/env/freedom-e300-hifive1/dhrystone.lds
@@ -0,0 +1,157 @@
+OUTPUT_ARCH( "riscv" )
+
+ENTRY( _start )
+
+MEMORY  /* Address regions that the SECTIONS placements (>flash, >ram) refer to. */
+{
+  flash (rxai!w) : ORIGIN = 0x20400000, LENGTH = 512M  /* holds code and the load image of .data; attributes exclude writable sections */
+  ram (wxa!ri) : ORIGIN = 0x80000000, LENGTH = 16K  /* 16K RAM: run-time home of .data (incl. read-only data), .bss and the stack */
+}
+
+PHDRS  /* Program headers that the ":name" suffixes in SECTIONS assign sections to. */
+{
+  flash PT_LOAD;  /* loadable segment for everything placed with :flash (.init through .lalign) */
+  ram_init PT_LOAD;  /* loadable segment for .dalign/.data: addresses in ram, load image in flash */
+  ram PT_NULL;  /* non-loaded header used by .bss and .stack, which carry no load image */
+}
+
+SECTIONS  /* Layout: code and init/fini tables in flash; all data, including read-only data, in ram. */
+{
+  __stack_size = DEFINED(__stack_size) ? __stack_size : 2K;  /* keep an already-defined __stack_size, otherwise default to 2K */
+
+  .init           :  /* .init input sections, kept even if unreferenced, in link order */
+  {
+    KEEP (*(SORT_NONE(.init)))
+  } >flash AT>flash :flash
+
+  .text           :  /* program code stays in flash; note that no read-only data is collected here */
+  {
+    *(.text.unlikely .text.unlikely.*)
+    *(.text.startup .text.startup.*)
+    *(.text .text.*)
+    *(.gnu.linkonce.t.*)
+  } >flash AT>flash :flash
+
+  .fini           :
+  {
+    KEEP (*(SORT_NONE(.fini)))
+  } >flash AT>flash :flash
+
+  PROVIDE (__etext = .);  /* end-of-code symbols, taken right after .fini */
+  PROVIDE (_etext = .);
+  PROVIDE (etext = .);
+
+  . = ALIGN(4);  /* 4-byte align the location counter before the init/fini arrays */
+
+  .preinit_array  :  /* bracketed by hidden __preinit_array_start/__preinit_array_end symbols */
+  {
+    PROVIDE_HIDDEN (__preinit_array_start = .);
+    KEEP (*(.preinit_array))
+    PROVIDE_HIDDEN (__preinit_array_end = .);
+  } >flash AT>flash :flash
+
+  .init_array     :  /* priority-sorted entries first, then unsorted .init_array and non-crt .ctors */
+  {
+    PROVIDE_HIDDEN (__init_array_start = .);
+    KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
+    KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
+    PROVIDE_HIDDEN (__init_array_end = .);
+  } >flash AT>flash :flash
+
+  .fini_array     :  /* same arrangement as .init_array, for .fini_array and .dtors entries */
+  {
+    PROVIDE_HIDDEN (__fini_array_start = .);
+    KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
+    KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
+    PROVIDE_HIDDEN (__fini_array_end = .);
+  } >flash AT>flash :flash
+
+  .ctors          :
+  {
+    /* gcc uses crtbegin.o to find the start of
+       the constructors, so we make sure it is
+       first.  Because this is a wildcard, it
+       doesn't matter if the user does not
+       actually link against crtbegin.o; the
+       linker won't look for a file to match a
+       wildcard.  The wildcard also means that it
+       doesn't matter which directory crtbegin.o
+       is in.  */
+    KEEP (*crtbegin.o(.ctors))
+    KEEP (*crtbegin?.o(.ctors))
+    /* We don't want to include the .ctor section from
+       the crtend.o file until after the sorted ctors.
+       The .ctor section from the crtend file contains the
+       end of ctors marker and it must be last */
+    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
+    KEEP (*(SORT(.ctors.*)))
+    KEEP (*(.ctors))
+  } >flash AT>flash :flash
+
+  .dtors          :  /* same crtbegin-first / crtend-last ordering as .ctors above */
+  {
+    KEEP (*crtbegin.o(.dtors))
+    KEEP (*crtbegin?.o(.dtors))
+    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
+    KEEP (*(SORT(.dtors.*)))
+    KEEP (*(.dtors))
+  } >flash AT>flash :flash
+
+  .lalign         :  /* holds no input sections; only aligns and records a flash position */
+  {
+    . = ALIGN(4);
+    PROVIDE( _data_lma = . );  /* 4-byte-aligned flash address after the flash-resident sections; presumably the copy source for startup code (not shown here) */
+  } >flash AT>flash :flash
+
+  .dalign         :  /* holds no input sections; first section whose addresses are in ram */
+  {
+    . = ALIGN(4);
+    PROVIDE( _data = . );  /* 4-byte-aligned ram address just ahead of .data */
+  } >ram AT>flash :ram_init
+
+  .data          :  /* read-only data is gathered here on purpose so it is accessed from ram rather than slow flash */
+  {
+    *(.rdata)
+    *(.rodata .rodata.*)
+    *(.gnu.linkonce.r.*)
+    *(.data .data.*)
+    *(.gnu.linkonce.d.*)
+    . = ALIGN(8);
+    PROVIDE( __global_pointer$ = . + 0x800 );  /* 0x800 past the start of the small-data inputs below; presumably centres the gp-relative 12-bit offset window - confirm against the RISC-V psABI */
+    *(.sdata .sdata.*)
+    *(.gnu.linkonce.s.*)
+    . = ALIGN(8);
+    *(.srodata.cst16)
+    *(.srodata.cst8)
+    *(.srodata.cst4)
+    *(.srodata.cst2)
+    *(.srodata .srodata.*)
+  } >ram AT>flash :ram_init
+
+  . = ALIGN(4);
+  PROVIDE( _edata = . );  /* end of initialised data in ram, 4-byte aligned */
+  PROVIDE( edata = . );
+
+  PROVIDE( _fbss = . );  /* start-of-bss symbols share the address of _edata */
+  PROVIDE( __bss_start = . );
+  .bss            :  /* small bss, regular bss and COMMON symbols; lives in ram only */
+  {
+    *(.sbss*)
+    *(.gnu.linkonce.sb.*)
+    *(.bss .bss.*)
+    *(.gnu.linkonce.b.*)
+    *(COMMON)
+    . = ALIGN(4);
+  } >ram AT>ram :ram
+
+  . = ALIGN(8);
+  PROVIDE( _end = . );  /* 8-byte-aligned end of .bss */
+  PROVIDE( end = . );
+
+  .stack ORIGIN(ram) + LENGTH(ram) - __stack_size :  /* pinned to the top __stack_size bytes of ram */
+  {
+    PROVIDE( _heap_end = . );  /* lowest address of the stack region */
+    . = __stack_size;  /* inside a section "." is an offset from its start, so this reserves __stack_size bytes */
+    PROVIDE( _sp = . );  /* resolves to ORIGIN(ram) + LENGTH(ram), the very top of ram */
+  } >ram AT>ram :ram
+}
diff --git a/riscv-gnu-toolchain b/riscv-gnu-toolchain
index 65cb174..bf5697a 160000
--- a/riscv-gnu-toolchain
+++ b/riscv-gnu-toolchain
@@ -1 +1 @@
-Subproject commit 65cb174d37e2efe99e094eb317113442c79ed3ce
+Subproject commit bf5697a1a6509705b50dcc1f67b8c620a7b21ec4
diff --git a/software/dhrystone/Makefile b/software/dhrystone/Makefile
index d401720..4602653 100644
--- a/software/dhrystone/Makefile
+++ b/software/dhrystone/Makefile
@@ -5,10 +5,10 @@ C_SRCS := dhry_stubs.c dhry_printf.c
 HEADERS := dhry.h
 
 DHRY_SRCS := dhry_1.c dhry_2.c
-DHRY_CFLAGS := -O2 -DTIME -fno-inline -fno-builtin-printf -Wno-implicit -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
+DHRY_CFLAGS := -O3 -DTIME -fno-inline -fno-builtin-printf -Wno-implicit -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
 
 XLEN ?= 32
-CFLAGS := -Os -fno-common  -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
+CFLAGS := -O3 -fno-common  -mcmodel=medany -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -mexplicit-relocs -DNOENUM -falign-functions=4
 LDFLAGS := -Wl,--wrap=scanf -Wl,--wrap=printf  -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -mcmodel=medany
 
 DHRY_OBJS := $(patsubst %.c,%.o,$(DHRY_SRCS))
-- 
cgit v1.2.3