DEV Community

ahmet gedik
ahmet gedik

Posted on

Monitoring and Alerting for Video Platform Infrastructure with Go

Monitoring a Multi-Region Video Vault

ViralVidVault serves viral video content from 7 European regions. When the Polish trending page goes down or the Swedish cron job fails, we need to know immediately -- not when a user complains. A custom Go monitor gives us exactly the health visibility we need.

Configuration

package monitor

import "time"

// Target describes one HTTP endpoint that the monitor probes.
type Target struct {
    // Name identifies the target; the Checker keys its latest result and
    // history by this value.
    Name       string        `json:"name"`
    // URL is fetched with a GET request on every check.
    URL        string        `json:"url"`
    // Interval is the intended time between checks.
    // NOTE(review): nothing in the visible code reads this field; the check
    // loop in main sleeps a fixed 5 minutes between rounds.
    Interval   time.Duration `json:"interval"`
    // Timeout is the HTTP client timeout used for this target's request.
    Timeout    time.Duration `json:"timeout"`
    // ExpectCode is the HTTP status the endpoint must return; a value <= 0
    // disables the status check.
    ExpectCode int           `json:"expect_code"`
    // ExpectBody is a substring that must appear in the first 512 KiB of the
    // response body; an empty string disables the body check.
    ExpectBody string        `json:"expect_body"`
    // Region is copied into each Result and used as a metrics label; it is
    // left empty for targets that are not region-specific.
    Region     string        `json:"region"`
}

// Result is the outcome of a single check of one Target.
type Result struct {
    // Name and Region are copied from the Target that was checked.
    Name       string        `json:"name"`
    Region     string        `json:"region"`
    // Status is "up" or "down", as set by Checker.Check.
    Status     string        `json:"status"`
    // StatusCode is the HTTP status received; it stays 0 when the request
    // itself failed and no response was obtained.
    StatusCode int           `json:"status_code"`
    // Latency is the time measured around the HTTP client call; reading the
    // body for the ExpectBody check happens afterwards and is not included.
    Latency    time.Duration `json:"latency"`
    // Error explains why the check is "down"; omitted from JSON when empty.
    Error      string        `json:"error,omitempty"`
    // Timestamp is taken right after the HTTP call returns.
    Timestamp  time.Time     `json:"timestamp"`
}

// Targets is the built-in list of endpoints to probe: the homepage, three
// regional trending pages, the JSON search API and the sitemap.
var Targets = []Target{
    {
        Name:       "homepage",
        URL:        "https://viralvidvault.com",
        Interval:   5 * time.Minute,
        Timeout:    10 * time.Second,
        ExpectCode: 200,
    },
    {
        Name:       "viral_pl",
        URL:        "https://viralvidvault.com/?region=PL",
        Interval:   10 * time.Minute,
        Timeout:    10 * time.Second,
        ExpectCode: 200,
        Region:     "PL",
    },
    {
        Name:       "viral_nl",
        URL:        "https://viralvidvault.com/?region=NL",
        Interval:   10 * time.Minute,
        Timeout:    10 * time.Second,
        ExpectCode: 200,
        Region:     "NL",
    },
    {
        Name:       "viral_se",
        URL:        "https://viralvidvault.com/?region=SE",
        Interval:   10 * time.Minute,
        Timeout:    10 * time.Second,
        ExpectCode: 200,
        Region:     "SE",
    },
    {
        Name:       "search_api",
        URL:        "https://viralvidvault.com/search?q=viral&format=json",
        Interval:   15 * time.Minute,
        Timeout:    15 * time.Second,
        ExpectCode: 200,
        ExpectBody: "results",
    },
    {
        Name:       "sitemap",
        URL:        "https://viralvidvault.com/sitemap.xml",
        Interval:   30 * time.Minute,
        Timeout:    10 * time.Second,
        ExpectCode: 200,
        ExpectBody: "<urlset",
    },
}
Enter fullscreen mode Exit fullscreen mode

The Checker Engine

package monitor

import (
    "fmt"
    "io"
    "net/http"
    "strings"
    "sync"
    "time"
)

// Checker probes targets over HTTP, remembers the most recent result and a
// bounded history per target, and publishes up-to-down transitions on an
// alert channel.
type Checker struct {
    client  *http.Client
    latest  map[string]*Result  // most recent result per target name
    history map[string][]Result // recent results per target name, oldest first
    mu      sync.RWMutex        // guards latest and history
    alerts  chan Result         // receives a Result when a target goes from "up" to "down"
}

// Alerts returns the receive-only channel on which up-to-down transitions
// are delivered.
func (c *Checker) Alerts() <-chan Result {
    return c.alerts
}

// Latest returns a copy of the most recent result recorded for name.
//
// A value is returned instead of the stored pointer so callers can read it
// without holding the lock. When no check has been recorded yet, the result
// carries the given name and the status "unknown", so callers never have to
// deal with a nil pointer.
func (c *Checker) Latest(name string) Result {
    c.mu.RLock()
    defer c.mu.RUnlock()

    if r := c.latest[name]; r != nil {
        return *r
    }
    return Result{Name: name, Status: "unknown"}
}

// NewChecker builds a Checker with empty result maps, a fresh HTTP client
// and an alert channel that buffers up to 100 pending alerts.
func NewChecker() *Checker {
    c := new(Checker)
    c.client = &http.Client{}
    c.latest = map[string]*Result{}
    c.history = map[string][]Result{}
    c.alerts = make(chan Result, 100)
    return c
}

// Check performs one GET request against t, evaluates the response against
// the target's expectations, records the outcome and returns it.
//
// The result is "down" when the request cannot be built or sent, when
// ExpectCode is positive and the status differs, or when ExpectBody is
// non-empty and is not found in (or cannot be read from) the first 512 KiB
// of the body. Otherwise it is "up". Every outcome is passed to record.
func (c *Checker) Check(t Target) Result {
    result := Result{Name: t.Name, Region: t.Region}

    // down fills in the failure reason, stores the result and returns it.
    down := func(reason string) Result {
        result.Status = "down"
        result.Error = reason
        c.record(t.Name, result)
        return result
    }

    req, err := http.NewRequest(http.MethodGet, t.URL, nil)
    if err != nil {
        // A malformed URL yields a nil request; report it instead of
        // dereferencing it.
        result.Timestamp = time.Now()
        return down(err.Error())
    }
    req.Header.Set("User-Agent", "ViralVidVault-Monitor/1.0")

    // Use a per-call copy of the client: writing the timeout into the shared
    // client would race when checks run concurrently. The copy shares the
    // original's Transport, so connections are still reused.
    client := *c.client
    client.Timeout = t.Timeout

    start := time.Now()
    resp, err := client.Do(req)
    result.Latency = time.Since(start)
    result.Timestamp = time.Now()
    if err != nil {
        return down(err.Error())
    }
    defer resp.Body.Close()

    result.StatusCode = resp.StatusCode
    if t.ExpectCode > 0 && resp.StatusCode != t.ExpectCode {
        return down(fmt.Sprintf("got %d, expected %d", resp.StatusCode, t.ExpectCode))
    }

    if t.ExpectBody != "" {
        // Cap the read so a huge response cannot exhaust memory.
        body, err := io.ReadAll(io.LimitReader(resp.Body, 512*1024))
        if err != nil {
            return down(fmt.Sprintf("reading body: %v", err))
        }
        if !strings.Contains(string(body), t.ExpectBody) {
            return down("missing expected body content")
        }
    }

    result.Status = "up"
    c.record(t.Name, result)
    return result
}

// record stores r as the latest result for name, appends it to that target's
// history, and emits an alert when the target has just gone from "up" to
// "down".
func (c *Checker) record(name string, r Result) {
    c.mu.Lock()
    defer c.mu.Unlock()

    // Detect state transitions for alerting.
    if prev := c.latest[name]; prev != nil && prev.Status == "up" && r.Status == "down" {
        // Never block while holding the lock: if the alert buffer is full
        // (no consumer, or a stalled one), a blocking send would freeze every
        // check and every reader. Dropping the alert is the lesser evil; the
        // failure is still visible in latest and history.
        select {
        case c.alerts <- r:
        default:
        }
    }

    c.latest[name] = &r

    // Keep the history bounded: once it exceeds 200 entries, discard the
    // oldest 100.
    hist := append(c.history[name], r)
    if len(hist) > 200 {
        hist = hist[100:]
    }
    c.history[name] = hist
}

// Uptime reports the percentage of "up" results among the history entries
// for name whose timestamp falls within the last window. It returns 100 when
// no entry falls inside the window.
func (c *Checker) Uptime(name string, window time.Duration) float64 {
    c.mu.RLock()
    defer c.mu.RUnlock()

    since := time.Now().Add(-window)
    samples, healthy := 0, 0
    for _, entry := range c.history[name] {
        if !entry.Timestamp.After(since) {
            continue // outside the window
        }
        samples++
        if entry.Status == "up" {
            healthy++
        }
    }

    if samples == 0 {
        return 100.0
    }
    return float64(healthy) / float64(samples) * 100
}
Enter fullscreen mode Exit fullscreen mode

Prometheus Metrics and Alerting

package main

import (
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "time"
    "myapp/monitor"
)

func main() {
    checker := monitor.NewChecker()

    // Background check loop
    go func() {
        for {
            for _, t := range monitor.Targets {
                r := checker.Check(t)
                if r.Status == "down" {
                    log.Printf("DOWN: %s (%s) - %s", r.Name, r.Region, r.Error)
                }
            }
            time.Sleep(5 * time.Minute)
        }
    }()

    // Alert consumer
    go func() {
        for alert := range checker.Alerts() {
            log.Printf("ALERT: %s went DOWN at %s - %s",
                alert.Name, alert.Timestamp.Format(time.RFC3339), alert.Error)
            // Send to Slack, email, etc.
        }
    }()

    // Prometheus metrics
    http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
        w.Header().Set("Content-Type", "text/plain; version=0.0.4")
        for _, t := range monitor.Targets {
            result := checker.Check(t)
            up := 0
            if result.Status == "up" { up = 1 }
            uptime := checker.Uptime(t.Name, 24*time.Hour)
            fmt.Fprintf(w, "vvv_endpoint_up{name=%q,region=%q} %d
", t.Name, t.Region, up)
            fmt.Fprintf(w, "vvv_endpoint_latency_ms{name=%q} %.0f
", t.Name, float64(result.Latency.Milliseconds()))
            fmt.Fprintf(w, "vvv_endpoint_uptime_24h{name=%q} %.2f
", t.Name, uptime)
        }
    })

    // Status JSON endpoint
    http.HandleFunc("/status", func(w http.ResponseWriter, r *http.Request) {
        w.Header().Set("Content-Type", "application/json")
        status := map[string]interface{}{}
        for _, t := range monitor.Targets {
            status[t.Name] = map[string]interface{}{
                "status":  checker.Latest(t.Name).Status,
                "latency": checker.Latest(t.Name).Latency.String(),
                "uptime":  fmt.Sprintf("%.1f%%", checker.Uptime(t.Name, 24*time.Hour)),
            }
        }
        json.NewEncoder(w).Encode(status)
    })

    log.Println("Monitor on :9090")
    log.Fatal(http.ListenAndServe(":9090", nil))
}
Enter fullscreen mode Exit fullscreen mode

This monitor runs as a single Go binary using under 15 MB of RAM, checking all ViralVidVault endpoints every 5 minutes. State-transition alerting means we are notified only when a target changes from up to down -- no alert fatigue from repeated "still down" messages.


This article is part of the Building ViralVidVault series. Check out ViralVidVault to see these techniques in action.

Top comments (0)