On Prometheus v2.18 there is an issue where the WAL (write-ahead log) keeps growing indefinitely and consuming disk space.
Root cause is that the WAL folder is not getting cleaned by default.
If you restart Prometheus, it tries to load the entire WAL into memory.
For example, if the WAL is 61GB and memory is 32GB, Prometheus keeps on restarting because it gets killed by the OOM killer each time, as it consumes the whole server memory of 24 GB.
Config details:
- '--web.enable-admin-api'
- '--config.file=/etc/prometheus/prometheus.yml'
- '--web.external-url=https://prometheus.example.com'
- '--storage.tsdb.path=/var/lib/prometheus'
- '--storage.tsdb.retention=150d'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
You can remove the WAL data manually by running the following commands from inside the Prometheus data directory (the `--storage.tsdb.path`):
# rm -rf wal/*
# rm -rf chunks_head/*
The script below can resolve the issue:
apiVersion: v1
kind: ConfigMap
metadata:
name: server-code
data:
sam.go: |
package main
import (
"fmt"
"log"
"net/http"
"os"
"os/exec"
"strconv"
"strings"
"time"
)
// delete removes the /data/wal and /data/chunks_head directories together
// with everything inside them.
//
// Each directory is handled independently: a failure to remove one is logged
// and does not prevent the attempt on the other. A successful removal is
// reported on stdout with the path that was removed.
//
// Note: within this package the name shadows Go's builtin delete.
func delete() {
	for _, dir := range []string{"/data/wal", "/data/chunks_head"} {
		if err := os.RemoveAll(dir); err != nil {
			// Report the failure as a failure instead of printing it
			// after a "Removed" prefix.
			log.Println("removing", dir, "failed:", err)
			continue
		}
		fmt.Println(" Removed ", dir)
	}
}
// create makes sure the on-disk layout under /data exists:
//
//   - the directories /data/wal and /data/chunks_head (mode 0777), and
//   - the empty file /data/queries.active.
//
// Anything it creates is chowned to uid/gid 65534. Directories that already
// exist (os.Stat succeeds) are left untouched, and the file is only created
// when os.Stat reports that it does not exist.
//
// Directory errors are logged and the function carries on. Failing to create
// /data/queries.active terminates the process via log.Fatal.
func create() {
	// NOTE(review): 65534 is presumably the "nobody" user/group the
	// Prometheus container runs as — confirm against the pod spec.
	const owner = 65534

	for _, dir := range []string{"/data/wal", "/data/chunks_head"} {
		if _, err := os.Stat(dir); err == nil {
			continue // already present, nothing to do
		}
		if err := os.MkdirAll(dir, os.FileMode(0777)); err != nil {
			log.Println(err)
			continue // no point chowning a directory that was not created
		}
		if err := os.Chown(dir, owner, owner); err != nil {
			log.Println(err)
		}
	}

	const activeQueries = "/data/queries.active"
	if _, err := os.Stat(activeQueries); !os.IsNotExist(err) {
		return
	}

	fmt.Println("Creating /data/queries.active ")
	f, err := os.Create(activeQueries)
	if err != nil {
		log.Fatal(err)
	}
	// Only an empty file is needed, so release the handle right away on
	// every path rather than only when chown fails.
	if err := f.Close(); err != nil {
		log.Println(err)
	}
	if err := os.Chown(activeQueries, owner, owner); err != nil {
		log.Println(err)
	}
}
func main() {
for {
time.Sleep(1 * time.Second)
out, err := exec.Command("du", "-sk", "/data/wal").Output()
if err == nil {
d := strings.Fields(string(out))[0]
f := strings.Replace(d, "K", "", 1)
if f1, e := strconv.Atoi(f); f1 > 5242880 && e == nil {
delete()
create()
} else {
fmt.Println("Size is less "+d+" ==> %q", (time.Now()))
}
url := "http://localhost:9090/graph"
req, _ := http.NewRequest("GET", url, nil)
res, _ := http.DefaultClient.Do(req)
if res == nil {
delete()
create()
}
} else {
fmt.Printf("Folder %q is not exists ==> %q"+"\n", ("/data/wal"), (time.Now()))
create()
}
}
}
Ref: https://stackoverflow.com/questions/63958695/prometheus-wal-keeps-on-growing-indefinitely
Top comments (0)