Last update: 15-Jul-2020
Author: R. Koucha
Using GCC sections to reorder the code
Introduction

To increase cache locality and reduce icache misses, it may be useful to group hot functions together in memory. GCC's ability to place data and code into named sections can be used to achieve this.

1. Get the current linker script of the compiler

With an example source code, launch gcc with the linker in verbose mode:

$ gcc -o example example.c -Wl,--verbose                          
GNU ld (GNU Binutils for Ubuntu) 2.34
  Supported emulations:
   elf_x86_64
   elf32_x86_64
   elf_i386
   elf_iamcu
   elf_l1om
   elf_k1om
   i386pep
   i386pe
using internal linker script:
==================================================
/* Script for -pie -z combreloc -z separate-code -z relro -z now */
/* Copyright (C) 2014-2020 Free Software Foundation, Inc.
   Copying and distribution of this script, with or without modification,
   are permitted in any medium without royalty provided the copyright
   notice and this notice are preserved.  */
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64",
          "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
ENTRY(_start)
SEARCH_DIR("=/usr/local/lib/x86_64-linux-gnu"); SEARCH_DIR("=/lib/x86_64-linux-gnu"); SEARCH_DIR("=/usr/lib/x86_64-linux-gnu"); SEARCH_DIR("=/usr/lib/x86_64-linux-gnu64"); SEARCH_DIR("=/usr/local/lib64"); SEARCH_DIR("=/lib64"); SEARCH_DIR("=/usr/lib64"); SEARCH_DIR("=/usr/local/lib"); SEARCH_DIR("=/lib"); SEARCH_DIR("=/usr/lib"); SEARCH_DIR("=/usr/x86_64-linux-gnu/lib64"); SEARCH_DIR("=/usr/x86_64-linux-gnu/lib");
SECTIONS
{
  PROVIDE (__executable_start = SEGMENT_START("text-segment", 0)); . = SEGMENT_START("text-segment", 0) + SIZEOF_HEADERS;
  .interp         : { *(.interp) }
  .note.gnu.build-id  : { *(.note.gnu.build-id) }
  .hash           : { *(.hash) }
  .gnu.hash       : { *(.gnu.hash) }
  .dynsym         : { *(.dynsym) }
  .dynstr         : { *(.dynstr) }
  .gnu.version    : { *(.gnu.version) }
  .gnu.version_d  : { *(.gnu.version_d) }
  .gnu.version_r  : { *(.gnu.version_r) }
  .rela.dyn       :
    {
      *(.rela.init)
      *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
      *(.rela.fini)
      *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
      *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
      *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
      *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
      *(.rela.ctors)
      *(.rela.dtors)
      *(.rela.got)
      *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
      *(.rela.ldata .rela.ldata.* .rela.gnu.linkonce.l.*)
      *(.rela.lbss .rela.lbss.* .rela.gnu.linkonce.lb.*)
      *(.rela.lrodata .rela.lrodata.* .rela.gnu.linkonce.lr.*)
      *(.rela.ifunc)
    }
  .rela.plt       :
    {
      *(.rela.plt)
      *(.rela.iplt)
    }
  . = ALIGN(CONSTANT (MAXPAGESIZE));
  .init           :
  {
    KEEP (*(SORT_NONE(.init)))
  }
  .plt            : { *(.plt) *(.iplt) }
  .plt.got        : { *(.plt.got) }
  .plt.sec        : { *(.plt.sec) }
  .text           :
  {
    *(.text.unlikely .text.*_unlikely .text.unlikely.*)
    *(.text.exit .text.exit.*)
    *(.text.startup .text.startup.*)
    *(.text.hot .text.hot.*)
    *(SORT(.text.sorted.*))
    *(.text .stub .text.* .gnu.linkonce.t.*)
    /* .gnu.warning sections are handled specially by elf.em.  */
    *(.gnu.warning)
  }
[...]
==================================================
[...]

The text between the two "==================================================" lines is the default linker script used by GNU ld when it is invoked through GCC. Put this content into a file (e.g. myld.script).

Create the new code sections with new names (e.g. another_0 and another_1) and use the same attributes as the .text section. Add them into the SECTIONS part.

  /* Output section intended to hold the functions tagged for "another_0".
     NOTE(review): the input patterns below are copied from .text and match
     .text* input sections; none of them names an input section called
     "another_0", which is the name used by the section attribute in the
     example program. An explicit *(another_0) pattern may be what is
     intended -- confirm against the objdump output. */
  .another_0           :
  {
    *(.text.unlikely .text.*_unlikely .text.unlikely.*)
    *(.text.exit .text.exit.*)
    *(.text.startup .text.startup.*)
    *(.text.hot .text.hot.*)
    *(SORT(.text.sorted.*))
    *(.text .stub .text.* .gnu.linkonce.t.*)
    /* .gnu.warning sections are handled specially by elf.em.  */
    *(.gnu.warning)
  }
  /* Output section intended to hold the functions tagged for "another_1".
     NOTE(review): the input patterns below are copied from .text and match
     .text* input sections; none of them names an input section called
     "another_1", which is the name used by the section attribute in the
     example program. An explicit *(another_1) pattern may be what is
     intended -- confirm against the objdump output. */
  .another_1          :
  {
    *(.text.unlikely .text.*_unlikely .text.unlikely.*)
    *(.text.exit .text.exit.*)
    *(.text.startup .text.startup.*)
    *(.text.hot .text.hot.*)
    *(SORT(.text.sorted.*))
    *(.text .stub .text.* .gnu.linkonce.t.*)
    /* .gnu.warning sections are handled specially by elf.em.  */
    *(.gnu.warning)
  }
2. Example of program dispatching functions into sections
#include <stdio.h>
#include <unistd.h>


// Compile-time switch: change the 0 to 1 to tag the functions with a
// dedicated section; with 0 the tags expand to nothing.
#if 0

// Functions carrying these tags are emitted in the sections named
// "another_0" and "another_1" respectively.
#define _another_0 __attribute__ ((section ("another_0")))
#define _another_1 __attribute__ ((section ("another_1")))

#else

// Sections disabled: both tags are empty.
#define _another_0
#define _another_1

#endif


// Print this function's own address followed by the "work 0" message.
// Tagged with _another_0, so its section depends on how that macro expands.
_another_0 void doing_stuff_0(void)
{
  void *self_addr = (void *)doing_stuff_0;

  printf("@%p: Doing some work 0...\n", self_addr);
} // doing_stuff_0


// Print this function's own address followed by the "work 1" message.
// Tagged with _another_1, so its section depends on how that macro expands.
_another_1 void doing_stuff_1(void)
{
  void *self_addr = (void *)doing_stuff_1;

  printf("@%p: Doing some work 1...\n", self_addr);
} // doing_stuff_1


// Print this function's own address followed by the "work 2" message.
// Tagged with _another_0, so its section depends on how that macro expands.
_another_0 void doing_stuff_2(void)
{
  void *self_addr = (void *)doing_stuff_2;

  printf("@%p: Doing some work 2...\n", self_addr);
} // doing_stuff_2

// Print this function's own address followed by the "work 3" message.
// Tagged with _another_1, so its section depends on how that macro expands.
_another_1 void doing_stuff_3(void)
{
  void *self_addr = (void *)doing_stuff_3;

  printf("@%p: Doing some work 3...\n", self_addr);
} // doing_stuff_3


// Entry point: loop forever; on each pass wait two seconds, then call the
// four doing_stuff_N functions in numeric order. The command-line
// arguments are not used.
int main(int ac, char *av[])
{
  (void)ac;
  (void)av;

  for (;;)
  {
    sleep(2);

    doing_stuff_0();
    doing_stuff_1();
    doing_stuff_2();
    doing_stuff_3();
  }

  // Not reached: the loop above never exits.
  return 0;
} // main

At the beginning of the source file there is a compilation flag to activate/deactivate the sections : #if 0 ...

The program is an infinite loop which executes 4 functions displaying their address in memory.

3. Compilation without the sections

The beginning of the source file is : #if 0

The custom linker script is passed with the -T option:

$ gcc -o example example.c -Wl,-Tmyld.script

The execution shows that the functions are consecutive in memory :

$ ./example
@0x555c75733169: Doing some work 0...
@0x555c7573318c: Doing some work 1...
@0x555c757331af: Doing some work 2...
@0x555c757331d2: Doing some work 3...
^C
$ objdump -h example
[...]
Sections:
Idx Name          Size      VMA               LMA               File off  Algn
[...]
15 .text         00000225  0000000000001080  0000000000001080  00001080  2**4
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
4. Compilation with the sections

The beginning of the source file is : #if 1

The custom linker script is passed with the -T option:

$ gcc -o example example.c -Wl,-Tmyld.script

The execution shows that the functions are grouped by section (another_0 and another_1) :

$ ./example
@0x563f105ec215: Doing some work 0...
@0x563f105ec25b: Doing some work 1...
@0x563f105ec238: Doing some work 2...
@0x563f105ec27e: Doing some work 3...
^C
$ objdump -h example
[...]
Sections:
Idx Name          Size      VMA               LMA               File off  Algn
[...]
 15 .text         00000195  0000000000001080  0000000000001080  00001080  2**4
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
 16 another_0     00000046  0000000000001215  0000000000001215  00001215  2**0
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
 17 another_1     00000046  000000000000125b  000000000000125b  0000125b  2**0
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
5. The "hot" gcc attribute

The hot attribute is used to inform the compiler that a function is a hot spot of the compiled program. The function is optimized more aggressively and, on many targets, it is placed into a special subsection of the text section so that all hot functions appear close together, improving locality.
Let's replace the another_0 section with the hot attribute to see what happens to the function locations:

// Switch enabled: the _another_0 tag now expands to the hot attribute
// instead of a named section, and _another_1 expands to nothing.
#if 1

//#define _another_0 __attribute__ ((section ("another_0")))
#define _another_0 __attribute__ ((hot))
//#define _another_1 __attribute__ ((section ("another_1")))
#define _another_1

#else

// Switch disabled: both tags are empty.
#define _another_0
#define _another_1

#endif

The custom linker script is no longer needed:

$ gcc -o example example.c -O2
$ ./example 
@0x56274041b100: Doing some work 0...
@0x56274041b230: Doing some work 1...
@0x56274041b120: Doing some work 2...
@0x56274041b250: Doing some work 3...
^C

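Among the optimizations that the compiler performs for hot functions, the rearrangement in memory that brings these functions closer together is also done here: doing_stuff_0 and doing_stuff_2 are now adjacent.
In this example, the functions do not appear in a dedicated section in the objdump output. Note that objdump -h lists output sections only, and the default linker script shown in section 1 gathers the .text.hot input sections into the .text output section: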

$ objdump -h example
[...]
Sections:
Idx Name          Size      VMA               LMA               File off  Algn
[...]
 15 .text         00000265  0000000000001080  0000000000001080  00001080  2**4
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
[...]

An optimization option is needed for the hot attribute to be taken into account; otherwise, no function rearrangement is done. For example, when we omit -O2:

$ gcc -o example example.c    
$ ./example
@0x559503e7b169: Doing some work 0...
@0x559503e7b18c: Doing some work 1...
@0x559503e7b1af: Doing some work 2...
@0x559503e7b1d2: Doing some work 3...
^C
Conclusion

Thanks to the section attribute of the GCC compiler, it is possible to rearrange the location of functions in memory. This makes it possible to place frequently called functions close to each other in order to increase cache locality and reduce icache misses.
To make things easier, using the hot attribute along with its counterpart, the cold attribute, can also be considered.

About the author

The author is an engineer in computer sciences located in France. He can be contacted here.