Author: R. Koucha
Last update: 02-Dec-2020
Cleanup of huge page files
Introduction
1 Example of huge pages allocation
2 Cleanup of the backing files
3 Closing of the backing file
References
About the author
Introduction

Huge pages are backed by files located in file systems of type hugetlbfs. When an application terminates, it is supposed to clean up and release all the resources it allocated. If the application maps huge pages, it is supposed to remove the backing files. But under unexpected termination conditions (e.g. upon receipt of a signal), the application may end without removing them.

1 Example of huge pages allocation

The system architecture on which the following program runs provides 2 MB huge pages. The program mounts a hugetlbfs file system on /tmp/hpfs (the available mount options are described in the kernel documentation), creates a file named memfile, extends it to the desired size to trigger the reservation of the corresponding huge pages and maps the file into its address space. Then the program writes some data into the memory area before calling pause() to wait for a signal to terminate. This pause allows us to make some observations while the process is still alive.

#include <sys/types.h>
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>

#define ERR(fmt, ...) do {                            \
    fprintf(stderr,                                   \
            "ERROR@%s#%d: "fmt,                       \
             __FUNCTION__, __LINE__, ## __VA_ARGS__); \
                         } while(0)


#define HP_SIZE   (2 * 1024 * 1024)  // Huge page size = 2 MB
#define HPFS_DIR  "/tmp/hpfs"
#define HPFS_SIZE (4 * HP_SIZE)


int main(void)
{
void *addr;
char  cmd[256];
int   status;
int   rc;
char  mount_opts[256];
int   fd;

  rc = mkdir(HPFS_DIR, 0777);
  if (0 != rc && EEXIST != errno) {
    ERR("mkdir(): %m (%d)\n", errno);
    return 1;
  }

  snprintf(mount_opts, sizeof(mount_opts), "pagesize=%d,size=%d,min_size=%d", HP_SIZE, 2*HP_SIZE, HP_SIZE);

  rc = mount("none", HPFS_DIR, "hugetlbfs", 0, mount_opts);
  if (0 != rc) {
    ERR("mount(): %m (%d)\n", errno);
    return 1;
  }

  fd = open(HPFS_DIR"/memfile", O_RDWR|O_CREAT, 0777);
  if (fd < 0) {
    ERR("open(%s): %m (%d)\n", "memfile_01", errno);
    return 1;
  }

  rc = ftruncate(fd, 2 * HP_SIZE);
  if (0 != rc) {
    ERR("ftruncate(): %m (%d)\n", errno);
    return 1;
  }

  addr = mmap(NULL, 2 * HP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
  if (MAP_FAILED == addr) {
    ERR("mmap(): %m (%d)\n", errno);
    return 1;
  }

  // Write some data in the mapped area
  memset(addr, 0xFF, 2 * HP_SIZE);
  snprintf(addr, 2 * HP_SIZE, "Data written into the huge page area located at virtual address %p...", addr);
  printf("%s\n", (char *)addr);

  // Wait for some signal to end the program
  pause();

  return 0;

} // main

Let's build and run the program:

$ gcc hp_alloc.c -o hp_alloc
$ ./hp_alloc
ERROR@main#44: mount(): Operation not permitted (1)  # Super user rights required for the mount operation
$ sudo ./hp_alloc 
ERROR@main#44: mount(): Cannot allocate memory (12)  # The huge page pool must be configured
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
0
$ sudo sh -c "echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages" # Add 8 huge pages in the pool
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages 
8
$ sudo ./hp_alloc
Data written into the huge page area located at virtual address 0x7ff744600000...

In another terminal, the map of the process shows the name of the backing file and the allocated memory area. As a side note, for historical reasons (cf. the kernel documentation), the space occupied by the huge pages is not counted in the Rss/Pss fields but in the Shared_Hugetlb/Private_Hugetlb fields:

"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
hugetlbfs page which is not counted in "RSS" or "PSS" field for historical
reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.

$ pidof hp_alloc
6282
$ sudo cat /proc/6282/smaps
[...]
7ff744600000-7ff744a00000 rw-p 00000000 00:3d 159202       /tmp/hpfs/memfile
Size:               4096 kB
KernelPageSize:     2048 kB
MMUPageSize:        2048 kB
Rss:                   0 kB
Pss:                   0 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:         0 kB
Referenced:            0 kB
Anonymous:             0 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
FilePmdMapped:        0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:    4096 kB
Swap:                  0 kB
SwapPss:               0 kB
Locked:                0 kB
THPeligible:		0
VmFlags: rd wr mr mw me de ht sd 
[...]
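
As a side note, these accounting fields can also be read programmatically. The following standalone sketch (not one of the article's example programs) adds up the Shared_Hugetlb and Private_Hugetlb fields found in /proc/self/smaps for the calling process:

#include <stdio.h>

int main(void)
{
FILE *fp;
char  line[256];
long  kb;
long  total_kb = 0;

  fp = fopen("/proc/self/smaps", "r");
  if (!fp) {
    perror("fopen");
    return 1;
  }

  // Add up the huge page accounting fields of all the memory areas
  while (fgets(line, sizeof(line), fp)) {
    if (1 == sscanf(line, "Shared_Hugetlb: %ld kB", &kb) ||
        1 == sscanf(line, "Private_Hugetlb: %ld kB", &kb)) {
      total_kb += kb;
    }
  }

  fclose(fp);

  printf("Memory backed by huge pages: %ld kB\n", total_kb);

  return 0;
}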

We can also look at the file created to back the huge pages as well as the consumed huge pages:

$ cat /proc/mounts
[...]
none /tmp/hpfs hugetlbfs rw,relatime,pagesize=2M,size=4194304,min_size=2097152 0 0
$ ls -l /tmp/hpfs/
total 0
-rwxr-xr-x 1 root root 4194304 nov.   30 11:34 memfile
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
8
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
6     # The program consumes 2 huge pages
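
These pool counters can also be consulted from a program. Here is a minimal sketch (not one of the article's example programs, and assuming the 2 MB huge page size used here) that prints the number of free huge pages:

#include <stdio.h>

int main(void)
{
FILE *fp;
long  nb;

  // Path assuming the 2 MB huge page size used in this article
  fp = fopen("/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages", "r");
  if (!fp) {
    perror("fopen");
    return 1;
  }

  if (1 != fscanf(fp, "%ld", &nb)) {
    fprintf(stderr, "Unexpected file content\n");
    fclose(fp);
    return 1;
  }

  fclose(fp);

  printf("Free 2 MB huge pages: %ld\n", nb);

  return 0;
}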

Back to the terminal where the program is running, let's terminate it with CTRL-C (SIGINT signal). A look into the file system shows that the backing file is still present, but the huge pages are put back into the pool as the terminated program no longer maps them:

$ sudo ./hp_alloc
Data written into the huge page area located at virtual address 0x7ff744600000...
^C
$ ls -l /tmp/hpfs/
total 0
-rwxr-xr-x 1 root root 4194304 nov.   30 11:34 memfile
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages 
8

Don't forget to unmount the file system manually after the termination of the program. Normally the mount operation is done at system startup but, for the sake of simplicity, we do it in the program.

$ sudo umount /tmp/hpfs
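
As a side note, such a persistent mount would typically be declared in /etc/fstab with options equivalent to the ones passed by the program; for instance (an illustrative entry to adapt to the target system):

none /tmp/hpfs hugetlbfs pagesize=2M,size=4M,min_size=2M 0 0
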
2 Cleanup of the backing files

To make the system clean up the backing files, it is customary to remove the files once they are mapped, as a file marked as deleted is effectively deleted only once its reference count drops to 0. So, as long as the file is referenced (i.e. opened or mapped), the process can mark it deleted and continue to use it. Upon process termination, the file is automatically closed and unmapped. This makes the reference counter drop to 0 and triggers the deletion of the file.

The following program is an enhancement of the first one with the added call to unlink():

#include <sys/types.h>
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>

#define ERR(fmt, ...) do {                            \
    fprintf(stderr,                                   \
            "ERROR@%s#%d: "fmt,                       \
             __FUNCTION__, __LINE__, ## __VA_ARGS__); \
                         } while(0)


#define HP_SIZE   (2 * 1024 * 1024)  // Huge page size
#define HPFS_DIR  "/tmp/hpfs"
#define HPFS_SIZE (4 * HP_SIZE)


int main(void)
{
void *addr;
char  cmd[256];
int   status;
int   rc;
char  mount_opts[256];
int   fd;

  rc = mkdir(HPFS_DIR, 0777);
  if (0 != rc && EEXIST != errno) {
    ERR("mkdir(): %m (%d)\n", errno);
    return 1;
  }

  snprintf(mount_opts, sizeof(mount_opts), "pagesize=%d,size=%d,min_size=%d", HP_SIZE, 2*HP_SIZE, HP_SIZE);

  rc = mount("none", HPFS_DIR, "hugetlbfs", 0, mount_opts);
  if (0 != rc) {
    ERR("mount(): %m (%d)\n", errno);
    return 1;
  }

  fd = open(HPFS_DIR"/memfile", O_RDWR|O_CREAT, 0777);
  if (fd < 0) {
    ERR("open(%s): %m (%d)\n", "memfile_01", errno);
    return 1;
  }

  rc = ftruncate(fd, 2 * HP_SIZE);
  if (0 != rc) {
    ERR("ftruncate(): %m (%d)\n", errno);
    return 1;
  }

  addr = mmap(NULL, 2 * HP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
  if (MAP_FAILED == addr) {
    ERR("mmap(): %m (%d)\n", errno);
    return 1;
  }

  // Unlink the file
  rc = unlink(HPFS_DIR"/memfile");
  if (0 != rc) {
    ERR("unlink(): %m (%d)\n", errno);
    return 1;
  }

  // Write some data in the mapped area
  memset(addr, 0xFF, 2 * HP_SIZE);
  snprintf(addr, 2 * HP_SIZE, "Data written into the huge page area located at virtual address %p...", addr);
  printf("%s\n", (char *)addr);

  // Wait for some signal to end the program
  pause();

  return 0;

} // main

Let's build and run it:

$ gcc hp_alloc2.c -o hp_alloc2
$ sudo ./hp_alloc2
Data written into the huge page area located at virtual address 0x7f9b9ca00000...

In another terminal, we can see that the file is marked deleted in the process map and its name no longer appears in the file system:

$ pidof hp_alloc2
7170
$ sudo cat /proc/7170/smaps
[...]
7f9b9ca00000-7f9b9ce00000 rw-p 00000000 00:3d 192537     /tmp/hpfs/memfile (deleted)
Size:               4096 kB
KernelPageSize:     2048 kB
MMUPageSize:        2048 kB
Rss:                   0 kB
Pss:                   0 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:         0 kB
Referenced:            0 kB
Anonymous:             0 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
FilePmdMapped:        0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:    4096 kB
Swap:                  0 kB
SwapPss:               0 kB
Locked:                0 kB
THPeligible:		0
VmFlags: rd wr mr mw me de ht sd
[...]
$ ls -l /tmp/hpfs
total 0
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
6

Back in the terminal where the program is running, we terminate it with CTRL-C (SIGINT signal). This triggers the deallocation of the huge pages and the effective deletion of the backing file:

$ sudo ./hp_alloc2
Data written into the huge page area located at virtual address 0x7f9b9ca00000...
^C
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
8
$ ls -l /tmp/hpfs/
total 0
$ sudo umount /tmp/hpfs

Removing the files to trigger an implicit cleanup upon process termination is a widely used technique. The kernel does the same for the huge pages allocated from the hidden mounted hugetlbfs file systems, as explained in this article.
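
For instance, an anonymous mapping requested with the MAP_HUGETLB flag is backed by a file created in this internal mount and the kernel takes care of its deletion by itself. Here is a minimal illustrative sketch (not one of the article's example programs; it only requires some huge pages available in the pool, no user-visible hugetlbfs mount):

#define _GNU_SOURCE   // Make sure MAP_HUGETLB is exposed by <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HP_SIZE (2 * 1024 * 1024)  // Huge page size = 2 MB

int main(void)
{
void *addr;

  // Anonymous huge page mapping backed by the kernel's internal
  // hugetlbfs mount: there is no user-visible file to clean up
  addr = mmap(NULL, HP_SIZE, PROT_READ | PROT_WRITE,
              MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
  if (MAP_FAILED == addr) {
    perror("mmap");
    return 1;
  }

  strcpy(addr, "Huge page allocated without any user-visible backing file");
  printf("%s\n", (char *)addr);

  // The huge page goes back to the pool when the mapping is removed
  munmap(addr, HP_SIZE);

  return 0;
}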

3 Closing of the backing file

In some applications, the number of huge pages is so big that this may lead to numerous open files, which could be detrimental to services needing file descriptors (e.g. the accept() system call, which creates a new socket for each incoming connection). The EMFILE ("Too many open files") error may be raised.

The manual of mmap() specifies that, after the mmap() call has returned, the file descriptor "can be closed immediately without invalidating the mapping". Hence the following enhanced version of our example program, which closes the file right after mapping it:

#include <sys/types.h>
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>


#define ERR(fmt, ...) do {                            \
    fprintf(stderr,                                   \
            "ERROR@%s#%d: "fmt,                       \
             __FUNCTION__, __LINE__, ## __VA_ARGS__); \
                         } while(0)


#define HP_SIZE   (2 * 1024 * 1024)  // Huge page size
#define HPFS_DIR  "/tmp/hpfs"
#define HPFS_SIZE (4 * HP_SIZE)


int main(void)
{
void *addr;
char  cmd[256];
int   status;
int   rc;
char  mount_opts[256];
int   fd;

  rc = mkdir(HPFS_DIR, 0777);
  if (0 != rc && EEXIST != errno) {
    ERR("mkdir(): %m (%d)\n", errno);
    return 1;
  }

  snprintf(mount_opts, sizeof(mount_opts), "pagesize=%d,size=%d,min_size=%d", HP_SIZE, 2*HP_SIZE, HP_SIZE);

  rc = mount("none", HPFS_DIR, "hugetlbfs", 0, mount_opts);
  if (0 != rc) {
    ERR("mount(): %m (%d)\n", errno);
    return 1;
  }

  fd = open(HPFS_DIR"/memfile", O_RDWR|O_CREAT, 0777);
  if (fd < 0) {
    ERR("open(%s): %m (%d)\n", "memfile_01", errno);
    return 1;
  }

  rc = ftruncate(fd, 2 * HP_SIZE);
  if (0 != rc) {
    ERR("ftruncate(): %m (%d)\n", errno);
    return 1;
  }

  addr = mmap(NULL, 2 * HP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
  if (MAP_FAILED == addr) {
    ERR("mmap(): %m (%d)\n", errno);
    return 1;
  }

  // Close the file
  rc = close(fd);
  if (rc != 0) {
    ERR("close(%d): %m (%d)\n", fd, errno);
    return 1;
  }

  // Unlink the file
  rc = unlink(HPFS_DIR"/memfile");
  if (0 != rc) {
    ERR("unlink(): %m (%d)\n", errno);
    return 1;
  }

  // Write some data in the mapped area
  memset(addr, 0xFF, 2 * HP_SIZE);
  snprintf(addr, 2 * HP_SIZE, "Data written into the huge page area located at virtual address %p...", addr);
  printf("%s\n", (char *)addr);

  // Wait for some signal to end the program
  pause();

  return 0;

} // main
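
This version can be built and run like the previous ones. While it is running, listing the content of /proc/<pid>/fd should no longer show any file descriptor opened on memfile, whereas /proc/<pid>/smaps still displays the mapping on the deleted file as in the previous section. The cleanup upon termination behaves exactly as before.
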
References
About the author

The author is an engineer in computer science located in France. He can be contacted here.