Commands & Recipes

Quick Start (All-in-One Docker)

# Start the entire LGTM stack in one container (dev/testing only)
# Ports: 3000 = Grafana UI, 4317 = OTLP gRPC, 4318 = OTLP HTTP.
# --rm removes the container on exit; -ti attaches an interactive TTY.
docker run --name lgtm \
  -p 3000:3000 \
  -p 4317:4317 \
  -p 4318:4318 \
  --rm -ti grafana/otel-lgtm

# With persistent data
# Bind-mounts ./data into the container at /data so stored telemetry
# survives restarts (note: no --rm here, container is kept).
docker run --name lgtm \
  -v "$(pwd)/data:/data" \
  -p 3000:3000 \
  -p 4317:4317 \
  -p 4318:4318 \
  grafana/otel-lgtm

# Enable internal component logs
# ENABLE_LOGS_ALL=true surfaces the bundled components' own logs —
# useful when debugging the LGTM container itself.
docker run --name lgtm \
  -e ENABLE_LOGS_ALL=true \
  -p 3000:3000 \
  -p 4317:4317 \
  -p 4318:4318 \
  grafana/otel-lgtm

Test it immediately — send a trace with curl:

# Send a minimal OTLP/HTTP trace. Use the CURRENT time for the span so
# it appears in Grafana's default (recent) time range — the previous
# hardcoded nanosecond values dated the span to 2018, making it
# invisible except by direct trace-ID lookup.
now=$(date +%s)   # epoch seconds; appending 9 zeros converts to nanoseconds
curl -X POST http://localhost:4318/v1/traces \
  -H "Content-Type: application/json" \
  -d '{
    "resourceSpans": [{
      "resource": {"attributes": [{"key": "service.name", "value": {"stringValue": "test-service"}}]},
      "scopeSpans": [{
        "spans": [{
          "traceId": "5b8efff798038103d269b633813fc60c",
          "spanId": "eee19b7ec3c1b174",
          "name": "test-span",
          "kind": 1,
          "startTimeUnixNano": "'"${now}"'000000000",
          "endTimeUnixNano": "'"$((now + 1))"'000000000"
        }]
      }]
    }]
  }'

Grafana Data Source Provisioning (Cross-Signal)

This is the most critical provisioning file for the LGTM stack — it wires up all cross-signal correlation:

# /etc/grafana/provisioning/datasources/lgtm.yaml
# Provisions all four LGTM datasources and wires the cross-signal links:
#   metrics -> traces (exemplars), logs -> traces (derivedFields),
#   traces -> logs / metrics / profiles (tracesTo* sections).
# NOTE: Grafana expands $VAR / ${VAR} environment placeholders in
# provisioning files, so a literal '$' must be written as '$$'
# (see $${__value.raw} and $$__tags below).
apiVersion: 1
datasources:
  # === METRICS (Mimir) ===
  - name: Mimir
    type: prometheus                  # Mimir speaks the Prometheus HTTP API
    uid: mimir                        # fixed uid so the other datasources can link here
    access: proxy
    url: http://mimir-query-frontend:8080/prometheus
    isDefault: true
    jsonData:
      httpMethod: POST
      # Exemplar trace IDs become clickable links that open Tempo.
      exemplarTraceIdDestinations:
        - name: traceID
          datasourceUid: tempo

  # === LOGS (Loki) ===
  - name: Loki
    type: loki
    uid: loki
    access: proxy
    url: http://loki-gateway:3100
    jsonData:
      # Extract trace IDs from JSON log lines and link each to Tempo.
      derivedFields:
        - datasourceUid: tempo
          matcherRegex: '"traceID":"(\w+)"'
          name: TraceID
          url: '$${__value.raw}'      # renders as ${__value.raw} after env expansion
          urlDisplayLabel: 'View Trace'

  # === TRACES (Tempo) ===
  - name: Tempo
    type: tempo
    uid: tempo
    access: proxy
    url: http://tempo-query-frontend:3200
    jsonData:
      # Span -> log correlation; the -1h/+1h shifts widen the log search
      # window around the span to absorb clock skew and late arrivals.
      tracesToLogsV2:
        datasourceUid: loki
        spanStartTimeShift: '-1h'
        spanEndTimeShift: '1h'
        tags:
          - key: service.name         # span attribute ...
            value: service_name       # ... mapped to this Loki label
        filterByTraceID: true
        filterBySpanID: false
      # Span -> metric correlation against Tempo-generated span metrics
      # (see metricsGenerator in the Tempo Helm values).
      tracesToMetrics:
        datasourceUid: mimir
        spanStartTimeShift: '-1h'
        spanEndTimeShift: '1h'
        tags:
          - key: service.name
            value: job
        queries:
          - name: 'Request Rate'
            query: 'sum(rate(traces_spanmetrics_calls_total{$$__tags}[5m]))'
          - name: 'Error Rate'
            query: 'sum(rate(traces_spanmetrics_calls_total{$$__tags,status_code="STATUS_CODE_ERROR"}[5m]))'
      # Span -> continuous-profiling correlation (Pyroscope).
      tracesToProfiles:
        datasourceUid: pyroscope
        tags:
          - key: service.name
            value: service_name
        profileTypeId: 'process_cpu:cpu:nanoseconds:cpu:nanoseconds'
      # Service graph is built from service-graph metrics stored in Mimir.
      serviceMap:
        datasourceUid: mimir
      nodeGraph:
        enabled: true

  # === PROFILES (Pyroscope) ===
  - name: Pyroscope
    type: grafana-pyroscope-datasource
    uid: pyroscope
    access: proxy
    url: http://pyroscope:4040

Alloy Configuration (Full LGTM Pipeline)

// config.alloy — Full LGTM pipeline with all 4 signals
//
// Pipeline order: receiver -> memory_limiter -> batch -> exporters.
// FIX: the memory_limiter must be the FIRST processor in the chain so
// it can refuse data and apply backpressure BEFORE anything is
// buffered by the batcher (OpenTelemetry Collector recommendation);
// previously batch ran first, defeating the limiter.

// =============================================
// RECEIVERS
// =============================================

// OTLP receiver for metrics, traces and logs
otelcol.receiver.otlp "default" {
  grpc { endpoint = "0.0.0.0:4317" }
  http { endpoint = "0.0.0.0:4318" }

  output {
    metrics = [otelcol.processor.memory_limiter.default.input]
    traces  = [otelcol.processor.memory_limiter.default.input]
    logs    = [otelcol.processor.memory_limiter.default.input]
  }
}

// Prometheus scrape for Kubernetes pods
prometheus.scrape "k8s_pods" {
  targets    = discovery.kubernetes.pods.targets
  forward_to = [prometheus.remote_write.mimir.receiver]
}

discovery.kubernetes "pods" {
  role = "pod"
}

// =============================================
// PROCESSORS
// =============================================

// Guards the collector's memory budget; checked every second.
otelcol.processor.memory_limiter "default" {
  check_interval = "1s"
  limit_mib      = 512

  output {
    metrics = [otelcol.processor.batch.default.input]
    traces  = [otelcol.processor.batch.default.input]
    logs    = [otelcol.processor.batch.default.input]
  }
}

// Batches telemetry to reduce outbound request volume.
otelcol.processor.batch "default" {
  timeout = "5s"
  send_batch_size = 8192

  output {
    metrics = [otelcol.exporter.prometheus.mimir.input]
    traces  = [otelcol.exporter.otlp.tempo.input]
    logs    = [otelcol.exporter.loki.default.input]
  }
}

// =============================================
// EXPORTERS
// =============================================

// Metrics → Mimir
prometheus.remote_write "mimir" {
  endpoint {
    url = "http://mimir-distributor:8080/api/v1/push"
  }
}

// Converts OTel metrics to Prometheus samples, then remote-writes them.
otelcol.exporter.prometheus "mimir" {
  forward_to = [prometheus.remote_write.mimir.receiver]
}

// Traces → Tempo
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo-distributor:4317"
    tls { insecure = true }  // in-cluster plaintext; enable TLS across trust boundaries
  }
}

// Logs → Loki
otelcol.exporter.loki "default" {
  forward_to = [loki.write.default.receiver]
}

loki.write "default" {
  endpoint {
    url = "http://loki-distributor:3100/loki/api/v1/push"
  }
}

Helm Values Snippets

Mimir (Key Production Settings)

# mimir-values.yaml (key settings only)
mimir:
  structuredConfig:
    common:
      storage:
        backend: s3                  # shared object-storage default for all components
        s3:
          endpoint: s3.us-east-1.amazonaws.com
          bucket_name: observability-mimir-blocks
          region: us-east-1
    blocks_storage:
      tsdb:
        # How long ingesters keep TSDB blocks on local disk after
        # shipping them to S3 (not a compaction setting).
        retention_period: 13h
      bucket_store:
        sync_interval: 15m           # how often the bucket index is re-synced
    limits:
      max_global_series_per_user: 1500000   # per-tenant active-series cap
      ingestion_rate: 100000                # per-tenant samples/sec
    ruler_storage:
      backend: s3
      s3:
        bucket_name: observability-mimir-ruler

# Write path; 3 replicas — presumably to match the default replication
# factor of 3, verify against your zone-awareness setup.
ingester:
  replicas: 3
  resources:
    requests: { cpu: "1", memory: "4Gi" }
    limits:   { cpu: "2", memory: "8Gi" }
  persistentVolume:
    enabled: true
    size: 50Gi

querier:
  replicas: 2
  resources:
    requests: { cpu: "500m", memory: "2Gi" }

store_gateway:
  replicas: 2

compactor:
  replicas: 1

Loki (Key Production Settings)

# loki-values.yaml
loki:
  auth_enabled: true                 # multi-tenancy: requests must carry X-Scope-OrgID
  storage:
    type: s3
    s3:
      endpoint: s3.us-east-1.amazonaws.com
      bucketnames: observability-loki-chunks
      region: us-east-1
  schemaConfig:
    configs:
      - from: "2024-01-01"           # date this schema takes effect
        store: tsdb                  # TSDB index store
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h                # one index table per day
  limits_config:
    retention_period: 720h  # 30 days
    max_global_streams_per_user: 10000   # per-tenant active-stream cap
    ingestion_rate_mb: 20                # per-tenant ingest rate
    per_stream_rate_limit: 5MB

ingester:
  replicas: 3

querier:
  replicas: 2

Tempo (Key Production Settings)

# tempo-values.yaml
tempo:
  multitenancyEnabled: true          # tenants separated via X-Scope-OrgID
  storage:
    trace:
      backend: s3
      s3:
        bucket: observability-tempo-traces
        endpoint: s3.us-east-1.amazonaws.com
        region: us-east-1
  # The metrics-generator derives span metrics and service graphs from
  # incoming traces and remote-writes them into Mimir — this is what
  # powers the tracesToMetrics and serviceMap links in the Grafana
  # datasource provisioning.
  metricsGenerator:
    enabled: true
    remoteWriteUrl: "http://mimir-distributor:8080/api/v1/push"
  global_overrides:
    defaults:
      metrics_generator:
        processors: [span-metrics, service-graphs]

ingester:
  replicas: 3

querier:
  replicas: 2

compactor:
  replicas: 1

OpenTelemetry SDK Quickstart

Java (Auto-Instrumentation)

# Download the OTel Java agent ("latest" here; pin a version for production)
curl -LO https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar

# Run your app with the agent
# -javaagent attaches the instrumentation before main() runs — no code
# changes required; -D properties configure service name and exporter.
java -javaagent:opentelemetry-javaagent.jar \
  -Dotel.service.name=my-service \
  -Dotel.exporter.otlp.endpoint=http://alloy:4317 \
  -jar my-app.jar

Python (Auto-Instrumentation)

# Install OTel packages
pip install opentelemetry-distro opentelemetry-exporter-otlp

# Auto-install all detected instrumentation libraries
# (scans the current environment and installs the matching
# opentelemetry-instrumentation-* packages)
opentelemetry-bootstrap -a install

# Run your app with auto-instrumentation
# opentelemetry-instrument wraps the process; the env vars on the same
# command line configure the SDK for that run only.
OTEL_SERVICE_NAME=my-service \
OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 \
opentelemetry-instrument python app.py

Go (Manual SDK)

// Initialize OTel in your Go app
import (
    "context"

    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.24.0"
)

// initTracer configures the global OTel tracer provider: an OTLP/gRPC
// exporter pointed at Alloy, wrapped in a batching span processor, with
// the service name attached as a resource attribute.
//
// Fixes vs. the original snippet: ctx was undefined and the error from
// otlptracegrpc.New was silently ignored.
//
// Returns the provider so the caller can Shutdown() it on exit.
func initTracer() (*trace.TracerProvider, error) {
    // Exporter setup is non-blocking by default; a background context
    // is sufficient here.
    ctx := context.Background()

    exporter, err := otlptracegrpc.New(ctx,
        otlptracegrpc.WithEndpoint("alloy:4317"),
        otlptracegrpc.WithInsecure(), // in-cluster plaintext; use TLS across trust boundaries
    )
    if err != nil {
        return nil, err
    }

    tp := trace.NewTracerProvider(
        trace.WithBatcher(exporter), // batch spans before export
        trace.WithResource(resource.NewWithAttributes(
            semconv.SchemaURL,
            semconv.ServiceNameKey.String("my-service"),
        )),
    )
    otel.SetTracerProvider(tp)
    return tp, nil
}

Environment Variables (All Languages)

# Universal OTel configuration via env vars
# These are read by every OpenTelemetry SDK, regardless of language.
export OTEL_SERVICE_NAME=my-service
export OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=production,k8s.namespace.name=default"
# parentbased_traceidratio: honor the parent span's sampling decision;
# for root spans, sample by trace-ID ratio (ARG below).
export OTEL_TRACES_SAMPLER=parentbased_traceidratio
export OTEL_TRACES_SAMPLER_ARG=0.1  # 10% sampling

Migration Recipes

Prometheus → Mimir (Add Long-Term Storage)

# Add to existing Prometheus config — zero-downtime migration
# Prometheus keeps scraping and serving queries exactly as before; it
# additionally streams every sample to Mimir. The X-Scope-OrgID header
# selects the Mimir tenant to write into.
remote_write:
  - url: http://mimir-distributor:8080/api/v1/push
    headers:
      X-Scope-OrgID: default

Jaeger → Tempo (Trace Backend Swap)

# Tempo natively accepts Jaeger protocol
# Just re-point your Jaeger agents/collectors to Tempo's endpoint:
# Jaeger Thrift HTTP: tempo-distributor:14268
# Jaeger gRPC:        tempo-distributor:14250
# Or preferably, switch to OTLP: tempo-distributor:4317

Elasticsearch/Kibana → Loki/Grafana (Conceptual)

  1. Deploy Loki alongside Elasticsearch
  2. Configure Alloy to send logs to both Loki AND Elasticsearch (dual-write)
  3. Rebuild critical Kibana dashboards in Grafana using LogQL
  4. Validate data completeness and query parity
  5. Cut over: stop writing to Elasticsearch
  6. Decommission Elasticsearch after retention period expires

Useful One-Liners

# Check LGTM component health
# FIX: each distributor serves /ready on its own HTTP port — Mimir on
# 8080, Loki on 3100, Tempo on 3200 (the old loop hit 8080 for all
# three, so Loki/Tempo checks always failed). The URL is also quoted.
for svc in mimir-distributor:8080 loki-distributor:3100 tempo-distributor:3200; do
  echo "$svc: $(curl -s "http://$svc/ready")"
done

# Query Mimir directly via curl (instant query through the Prometheus API)
curl -s -H "X-Scope-OrgID: default" \
  "http://mimir-query-frontend:8080/prometheus/api/v1/query?query=up" | jq .

# Push a test log to Loki (timestamps are nanoseconds — hence the 9 zeros)
curl -X POST -H "Content-Type: application/json" \
  -H "X-Scope-OrgID: default" \
  "http://loki-distributor:3100/loki/api/v1/push" \
  -d '{"streams":[{"stream":{"app":"test"},"values":[ ["'"$(date +%s)"'000000000","hello from curl"]]}]}'

# Query Loki directly
curl -s -H "X-Scope-OrgID: default" \
  "http://loki-query-frontend:3100/loki/api/v1/query_range?query={app=\"test\"}&limit=10" | jq .

# Check Tempo trace by ID
curl -s "http://tempo-query-frontend:3200/api/traces/5b8efff798038103d269b633813fc60c" | jq .