Debug School

Cover image for Fix Prometheus WAL dir size issue
Suyash Sambhare
Suyash Sambhare

Posted on • Updated on

Fix Prometheus WAL dir size issue

On Prometheus v2.18 there is an issue that the WAL keeps on growing indefinitely and consuming disk space.
Root cause is that the WAL folder is not getting cleaned by default.
If you restart Prometheus, it tries to load the entire WAL into memory.

For example, if the WAL is 61 GB and the server has 32 GB of memory, Prometheus consumes all 32 GB while replaying the WAL, gets killed by the OOM killer, and keeps restarting in a loop.

Config details:

   - '--web.enable-admin-api'
   - '--config.file=/etc/prometheus/prometheus.yml'
   - '--web.external-url=https://prometheus.example.com'
   - '--storage.tsdb.path=/var/lib/prometheus'
   - '--storage.tsdb.retention=150d'
   - '--web.console.libraries=/usr/share/prometheus/console_libraries'
   - '--web.console.templates=/usr/share/prometheus/consoles'
Enter fullscreen mode Exit fullscreen mode

You can remove the WAL data by running the following commands from the Prometheus data directory:
# rm -rf wal/*
# rm -rf chunks_head/*

Beas

The script below can resolve the issue:

apiVersion: v1
kind: ConfigMap
metadata:
  name: server-code
data:
  sam.go: |
    package main

    import (
        "fmt"
        "log"
        "net/http"
        "os"
        "os/exec"
        "strconv"
        "strings"
        "time"
    )

    // delete removes the /data/wal and /data/chunks_head directories together
    // with everything inside them.
    //
    // A failure on one directory is logged and does not prevent the other
    // directory from being processed.
    //
    // NOTE: the name shadows Go's built-in delete inside this package.
    func delete() {
        folders := []string{"/data/wal", "/data/chunks_head"}
        for _, dir := range folders {
            // os.RemoveAll returns nil when the path does not exist, so a
            // non-nil error here is a genuine failure worth reporting.
            if err := os.RemoveAll(dir); err != nil {
                log.Println("removing", dir, "failed:", err)
                continue
            }
            fmt.Println(" Removed ", dir)
        }
    }
    // create makes sure /data/wal, /data/chunks_head and /data/queries.active
    // exist, creating whichever of them is missing and handing ownership to
    // uid/gid 65534.
    //
    // Directory errors are logged and skipped. Failing to create
    // /data/queries.active terminates the process via log.Fatal.
    func create() {
        // NOTE(review): 65534 is conventionally the "nobody" user/group;
        // presumably the account Prometheus runs as — confirm.
        const uid, gid = 65534, 65534

        folders := []string{"/data/wal", "/data/chunks_head"}
        for _, dir := range folders {
            if _, err := os.Stat(dir); err == nil {
                continue // already present; leave it untouched
            }
            if err := os.MkdirAll(dir, os.FileMode(0777)); err != nil {
                log.Println(err)
                continue // nothing to chown if the directory was not created
            }
            if err := os.Chown(dir, uid, gid); err != nil {
                log.Println(err)
            }
        }

        const queryLog = "/data/queries.active"
        if _, err := os.Stat(queryLog); !os.IsNotExist(err) {
            return
        }

        fmt.Println("Creating /data/queries.active ")
        emptyFile, err := os.Create(queryLog)
        if err != nil {
            log.Fatal(err)
        }
        // Always release the handle; only an empty file on disk is needed.
        if err := emptyFile.Close(); err != nil {
            log.Println("closing", queryLog, "failed:", err)
        }
        if err := os.Chown(queryLog, uid, gid); err != nil {
            log.Println("chown", queryLog, "failed:", err)
        }
    }
    func main() {
        for {
            time.Sleep(1 * time.Second)
            out, err := exec.Command("du", "-sk", "/data/wal").Output()
            if err == nil {
                d := strings.Fields(string(out))[0]
                f := strings.Replace(d, "K", "", 1)
                if f1, e := strconv.Atoi(f); f1 > 5242880 && e == nil {
                    delete()
                                    create()

                } else {
                    fmt.Println("Size is less "+d+" ==>  %q", (time.Now()))
                }

                url := "http://localhost:9090/graph"

                req, _ := http.NewRequest("GET", url, nil)

                res, _ := http.DefaultClient.Do(req)

                                 if res == nil {
                              delete()
                                              create()
                             }



            } else {
                fmt.Printf("Folder %q is not exists  ==>  %q"+"\n", ("/data/wal"), (time.Now()))
                            create()
            }

        }

    }

Enter fullscreen mode Exit fullscreen mode

Ref: https://stackoverflow.com/questions/63958695/prometheus-wal-keeps-on-growing-indefinitely

Top comments (0)