Monitoring a Multi-Region Video Vault
ViralVidVault serves viral video content from 7 European regions. When the Polish trending page goes down or the Swedish cron job fails, we need to know immediately -- not when a user complains. A custom Go monitor gives us exactly the health visibility we need.
Configuration
package monitor
import "time"
// Target describes one endpoint to probe: where to send the GET request
// and what a healthy response looks like.
type Target struct {
Name string `json:"name"` // unique key; used to index latest/history in Checker
URL string `json:"url"` // full URL to GET
Interval time.Duration `json:"interval"` // intended per-target polling period (NOTE(review): the loop shown in main uses one global cadence — confirm if this is honored elsewhere)
Timeout time.Duration `json:"timeout"` // per-request timeout for the probe
ExpectCode int `json:"expect_code"` // expected HTTP status; 0 disables the status check
ExpectBody string `json:"expect_body"` // substring that must appear in the body; "" disables the body check
Region string `json:"region"` // optional region label carried into results/metrics
}
// Result is the outcome of a single probe of one Target.
type Result struct {
Name string `json:"name"` // Target.Name this result belongs to
Region string `json:"region"` // copied from Target.Region
Status string `json:"status"` // "up" or "down" (set by Checker.Check)
StatusCode int `json:"status_code"` // HTTP status received; zero when the request itself failed
Latency time.Duration `json:"latency"` // wall-clock time from request start to response (or failure)
Error string `json:"error,omitempty"` // human-readable failure reason; empty when up
Timestamp time.Time `json:"timestamp"` // when the probe completed
}
// Targets is the static list of endpoints the monitor probes.
// Region-less entries (homepage, search_api, sitemap) are site-wide checks;
// the viral_* entries verify each regional trending page individually.
var Targets = []Target{
{Name: "homepage", URL: "https://viralvidvault.com", Interval: 5 * time.Minute, Timeout: 10 * time.Second, ExpectCode: 200},
{Name: "viral_pl", URL: "https://viralvidvault.com/?region=PL", Interval: 10 * time.Minute, Timeout: 10 * time.Second, ExpectCode: 200, Region: "PL"},
{Name: "viral_nl", URL: "https://viralvidvault.com/?region=NL", Interval: 10 * time.Minute, Timeout: 10 * time.Second, ExpectCode: 200, Region: "NL"},
{Name: "viral_se", URL: "https://viralvidvault.com/?region=SE", Interval: 10 * time.Minute, Timeout: 10 * time.Second, ExpectCode: 200, Region: "SE"},
{Name: "search_api", URL: "https://viralvidvault.com/search?q=viral&format=json", Interval: 15 * time.Minute, Timeout: 15 * time.Second, ExpectCode: 200, ExpectBody: "results"},
{Name: "sitemap", URL: "https://viralvidvault.com/sitemap.xml", Interval: 30 * time.Minute, Timeout: 10 * time.Second, ExpectCode: 200, ExpectBody: "<urlset"},
}
The Checker Engine
package monitor
import (
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
	"sync"
	"time"
)
// Checker probes Targets and keeps per-target state: the latest result,
// a bounded history for uptime math, and a channel of up->down alerts.
type Checker struct {
client *http.Client // shared HTTP client reused across all probes
latest map[string]*Result // most recent result, keyed by Target.Name
history map[string][]Result // recent results per target, trimmed in record()
mu sync.RWMutex // guards latest and history
alerts chan Result // buffered; receives a Result on each up->down transition
}
// NewChecker constructs a ready-to-use Checker with empty state maps and
// a buffered alert channel (100 entries) so senders rarely block.
func NewChecker() *Checker {
	c := &Checker{
		client:  &http.Client{},
		alerts:  make(chan Result, 100),
		latest:  map[string]*Result{},
		history: map[string][]Result{},
	}
	return c
}
// Check probes a single target once, records the outcome, and returns it.
//
// It is safe for concurrent use: the timeout is enforced per request via a
// context deadline instead of mutating the shared http.Client's Timeout,
// which would race when Check runs from multiple goroutines.
func (c *Checker) Check(t Target) Result {
	ctx, cancel := context.WithTimeout(context.Background(), t.Timeout)
	defer cancel()

	start := time.Now()
	result := Result{
		Name:      t.Name,
		Region:    t.Region,
		Timestamp: start,
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, t.URL, nil)
	if err != nil {
		// Malformed URL in the target config — report it instead of
		// dereferencing a nil request.
		result.Status = "down"
		result.Error = err.Error()
		c.record(t.Name, result)
		return result
	}
	req.Header.Set("User-Agent", "ViralVidVault-Monitor/1.0")

	resp, err := c.client.Do(req)
	result.Latency = time.Since(start)
	if err != nil {
		result.Status = "down"
		result.Error = err.Error()
		c.record(t.Name, result)
		return result
	}
	defer resp.Body.Close()

	result.StatusCode = resp.StatusCode
	if t.ExpectCode > 0 && resp.StatusCode != t.ExpectCode {
		result.Status = "down"
		result.Error = fmt.Sprintf("got %d, expected %d", resp.StatusCode, t.ExpectCode)
		c.record(t.Name, result)
		return result
	}

	if t.ExpectBody != "" {
		// Cap the read at 512 KiB so a huge response cannot exhaust memory.
		body, err := io.ReadAll(io.LimitReader(resp.Body, 512*1024))
		if err != nil || !strings.Contains(string(body), t.ExpectBody) {
			result.Status = "down"
			result.Error = "missing expected body content"
			c.record(t.Name, result)
			return result
		}
	} else {
		// Best-effort bounded drain so the transport can reuse the
		// connection; errors here don't affect the health verdict.
		io.Copy(io.Discard, io.LimitReader(resp.Body, 512*1024))
	}

	result.Status = "up"
	c.record(t.Name, result)
	return result
}
// record stores the newest result for name, appends it to the bounded
// history, and emits an alert on an up->down state transition.
func (c *Checker) record(name string, r Result) {
	c.mu.Lock()
	defer c.mu.Unlock()
	// Detect state transitions for alerting.
	prev := c.latest[name]
	if prev != nil && prev.Status == "up" && r.Status == "down" {
		// Non-blocking send: if the alert buffer is full (or nothing is
		// draining it), drop the alert rather than stall every checker
		// goroutine behind the mutex we are currently holding.
		select {
		case c.alerts <- r:
		default:
		}
	}
	c.latest[name] = &r
	c.history[name] = append(c.history[name], r)
	// Trim in batches of 100 once we exceed 200 entries. Copy into a fresh
	// slice so the discarded prefix's backing array can be garbage-collected
	// (a plain sub-slice would keep the whole array alive).
	if len(c.history[name]) > 200 {
		c.history[name] = append([]Result(nil), c.history[name][100:]...)
	}
}
// Uptime returns the percentage of recorded checks for name within the
// trailing window that reported "up". With no samples in the window it
// returns 100.0 — no evidence of downtime.
func (c *Checker) Uptime(name string, window time.Duration) float64 {
	c.mu.RLock()
	defer c.mu.RUnlock()
	cutoff := time.Now().Add(-window)
	var total, up int
	for _, r := range c.history[name] {
		if r.Timestamp.After(cutoff) {
			total++
			if r.Status == "up" {
				up++
			}
		}
	}
	if total == 0 {
		return 100.0
	}
	return float64(up) / float64(total) * 100
}

// Latest returns a copy of the most recent result for name. If the target
// has not been checked yet it returns a placeholder with Status "unknown",
// so callers never dereference nil.
func (c *Checker) Latest(name string) *Result {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if r := c.latest[name]; r != nil {
		cp := *r
		return &cp
	}
	return &Result{Name: name, Status: "unknown"}
}

// Alerts exposes the up->down transition stream for a consumer goroutine.
func (c *Checker) Alerts() <-chan Result {
	return c.alerts
}
Prometheus Metrics and Alerting
package main
import (
"encoding/json"
"fmt"
"log"
"net/http"
"time"
"myapp/monitor"
)
func main() {
checker := monitor.NewChecker()
// Background check loop
go func() {
for {
for _, t := range monitor.Targets {
r := checker.Check(t)
if r.Status == "down" {
log.Printf("DOWN: %s (%s) - %s", r.Name, r.Region, r.Error)
}
}
time.Sleep(5 * time.Minute)
}
}()
// Alert consumer
go func() {
for alert := range checker.Alerts() {
log.Printf("ALERT: %s went DOWN at %s - %s",
alert.Name, alert.Timestamp.Format(time.RFC3339), alert.Error)
// Send to Slack, email, etc.
}
}()
// Prometheus metrics
http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain; version=0.0.4")
for _, t := range monitor.Targets {
result := checker.Check(t)
up := 0
if result.Status == "up" { up = 1 }
uptime := checker.Uptime(t.Name, 24*time.Hour)
fmt.Fprintf(w, "vvv_endpoint_up{name=%q,region=%q} %d
", t.Name, t.Region, up)
fmt.Fprintf(w, "vvv_endpoint_latency_ms{name=%q} %.0f
", t.Name, float64(result.Latency.Milliseconds()))
fmt.Fprintf(w, "vvv_endpoint_uptime_24h{name=%q} %.2f
", t.Name, uptime)
}
})
// Status JSON endpoint
http.HandleFunc("/status", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
status := map[string]interface{}{}
for _, t := range monitor.Targets {
status[t.Name] = map[string]interface{}{
"status": checker.Latest(t.Name).Status,
"latency": checker.Latest(t.Name).Latency.String(),
"uptime": fmt.Sprintf("%.1f%%", checker.Uptime(t.Name, 24*time.Hour)),
}
}
json.NewEncoder(w).Encode(status)
})
log.Println("Monitor on :9090")
log.Fatal(http.ListenAndServe(":9090", nil))
}
This monitor runs as a single Go binary using under 15MB RAM. The simple loop shown checks every endpoint on one shared 5-minute cadence — the per-target Interval field in the config is there for a future per-target scheduler and is not yet honored. The state transition alerting means we only get notified when something changes from up to down -- no alert fatigue from repeated "still down" messages.
This article is part of the Building ViralVidVault series. Check out ViralVidVault to see these techniques in action.
Top comments (0)