diff --git a/.gitignore b/.gitignore
index ba678ac..dda1ec8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.env
 .env
+.a0proj/
diff --git a/OTEL_INTEGRATION_NOTES.md b/OTEL_INTEGRATION_NOTES.md
new file mode 100644
index 0000000..9c85048
--- /dev/null
+++ b/OTEL_INTEGRATION_NOTES.md
@@ -0,0 +1,114 @@
+# OTEL / Observability Integration Notes
+
+> **Last updated**: 2026-06-15
+> **Author**: Agent Zero analysis
+> **Scope**: All `curated_compose` stacks
+
+---
+
+## TL;DR
+
+- **LGTM** is the central OTEL backend (traces, metrics, logs via Grafana/Tempo/Loki/Prometheus).
+- **n8n** → LGTM directly (✅ working).
+- **Langfuse** → LGTM (Langfuse's own self-traces, ✅ working).
+- **Headroom** → Langfuse (intentional — LLM-specific observability).
+- **Chroma** → LGTM (wired in `docker/chroma/compose.yaml`, default `http://lgtm:4317`; verify after deploy).
+- **Dify** → ❌ no OTEL support yet.
+
+---
+
+## Stack-by-Stack Telemetry Status
+
+| Stack | Sends to LGTM? | Sends to Langfuse? | Configured? | Notes |
+|-------|---------------|-------------------|-------------|-------|
+| **LGTM** | — | — | ✅ | Receives OTLP gRPC `:4317`, HTTP `:4318` |
+| **n8n** | ✅ Yes | ❌ No | ✅ Active | `OTEL_EXPORTER_OTLP_ENDPOINT=http://lgtm:4318` on main + worker |
+| **Langfuse** | ✅ Yes | — | ✅ Active | Own traces to LGTM; stack at `docker/langfuse/compose.yaml` |
+| **Headroom** | ❌ No | ✅ Yes | ✅ Active | `OTEL_EXPORTER_OTLP_ENDPOINT=http://langfuse-web:3000/api/public/otel/v1` |
+| **Chroma** | ✅ Yes | ❌ No | ✅ Wired | Compose passes `CHROMA_OPEN_TELEMETRY__ENDPOINT` (default `http://lgtm:4317`, OTLP gRPC); verify after deploy |
+| **Dify** | ❌ No | ❌ No | ❌ None | No OTEL env vars in compose or `.env.example` |
+
+---
+
+## Architecture
+
+```
+Headroom Proxy ──OTEL──→ Langfuse ──OTEL──→ LGTM
+                            │
+                            └── ClickHouse (analytics)
+                            └── Postgres (metadata)
+                            └── Redis (queues)
+                            └── MinIO (S3 storage)
+
+n8n (main + worker) ──OTEL──→ LGTM
+
+Chroma ──OTEL──→ LGTM
+[Dify] ──❌──→ LGTM
+```
+
+### Why Headroom → Langfuse (not direct to LGTM)?
+
+Langfuse is purpose-built for LLM observability — it tracks cost per token, prompt versions, user attribution, and LLM-specific metrics that Tempo/Grafana don't natively understand. Headroom's traces are most valuable inside Langfuse.
+
+Langfuse then exports its own internal traces to LGTM for infrastructure-wide correlation.
+
+---
+
+## Known Issues / Action Items
+
+### 🟢 Chroma — OTEL Wired (previously not wired)
+
+**Problem (resolved)**: `docker/chroma/.env.example` defines:
+```
+CHROMA_OPEN_TELEMETRY__ENDPOINT=
+CHROMA_OPEN_TELEMETRY__SERVICE_NAME=chromadb
+OTEL_EXPORTER_OTLP_HEADERS=
+```
+
+But `docker/chroma/compose.yaml` did **not** pass these env vars into the `chroma` service.
+
+**Fix (applied)**: Added to `compose.yaml` service `environment:` (gRPC port `:4317`, because Chroma exports OTLP over gRPC):
+```yaml
+CHROMA_OPEN_TELEMETRY__ENDPOINT: ${CHROMA_OPEN_TELEMETRY__ENDPOINT:-http://lgtm:4317}
+CHROMA_OPEN_TELEMETRY__SERVICE_NAME: ${CHROMA_OPEN_TELEMETRY__SERVICE_NAME:-chromadb}
+OTEL_EXPORTER_OTLP_HEADERS: ${OTEL_EXPORTER_OTLP_HEADERS:-}
+```
+
+### 🟡 Dify — No OTEL Support
+
+**Problem**: Dify doesn't expose OTEL configuration natively. It's Python/Flask-based but there's no auto-instrumentation or manual instrumentation in the current compose.
+
+**Recommendation**: Wait for upstream Dify to add native OTEL support. Do not create custom patches per SKILL.md conventions.
+
+### 🟢 Langfuse Network Verification
+
+**Status**: Headroom joins external network `langfuse` with `name: langfuse_langfuse`. This is auto-created by Docker Compose from the `docker/langfuse/` directory. **This should work** on deployment.
+
+**Verify after deploy**: `docker network inspect langfuse_langfuse` should show both `langfuse-web` and `headroom-proxy` containers.
+
+### 🟡 Unified Log Collection
+
+All stacks emit container logs. For collecting these into LGTM/Loki:
+
+- **Option A** (simplest): Configure Docker daemon with Loki log driver globally on Unraid.
+- **Option B** (per-stack): Add Promtail sidecar to each compose.
+
+**Recommendation**: Option A — configure once at the Docker daemon level.
+
+---
+
+## Files Referenced
+
+| File | Purpose |
+|------|---------|
+| `docker/chroma/compose.yaml` | Chroma vector DB stack |
+| `docker/chroma/.env.example` | Chroma config (OTEL vars present) |
+| `docker/dify/docker-compose.yaml` | Dify LLM platform |
+| `docker/dify/.env.example` | Dify config (no OTEL vars) |
+| `docker/headroom/compose.yaml` | Headroom LLM proxy |
+| `docker/langfuse/compose.yaml` | Langfuse observability |
+| `docker/lgtm/docker-compose.yaml` | LGTM (OTEL backend) |
+| `docker/lgtm/.env.example` | LGTM config |
+| `docker/n8n/docker-compose.yaml` | n8n automation |
+| `docker/n8n/.env.example` | n8n config (OTEL vars present) |
+| `SKILL.md` | Homelab conventions and design rules |
diff --git a/docker/chroma/compose.yaml b/docker/chroma/compose.yaml
index 68785c3..f0499e6 100644
--- a/docker/chroma/compose.yaml
+++ b/docker/chroma/compose.yaml
@@ -41,6 +41,9 @@ services:
 
       # Telemetry
       ANONYMIZED_TELEMETRY: ${ANONYMIZED_TELEMETRY:-false}
+      CHROMA_OPEN_TELEMETRY__ENDPOINT: ${CHROMA_OPEN_TELEMETRY__ENDPOINT:-http://lgtm:4317}
+      CHROMA_OPEN_TELEMETRY__SERVICE_NAME: ${CHROMA_OPEN_TELEMETRY__SERVICE_NAME:-chromadb}
+      OTEL_EXPORTER_OTLP_HEADERS: ${OTEL_EXPORTER_OTLP_HEADERS:-}
 
       # Migrations
       MIGRATIONS: ${MIGRATIONS:-apply}
diff --git a/docker/headroom/.env.example b/docker/headroom/.env.example
new file mode 100644
index 0000000..7914eaa
--- /dev/null
+++ b/docker/headroom/.env.example
@@ -0,0 +1,37 @@
+# =============================================================================
+# Headroom stack environment
+# =============================================================================
+# Copy to .env and edit values for your deployment.
+#   cp .env.example .env
+# The actual .env is deployed by Dockhand and should not be committed.
+# ============================================================================= + +# ----------------------------------------------------------------------------- +# Image / Runtime +# ----------------------------------------------------------------------------- +HEADROOM_IMAGE=ghcr.io/chopratejas/headroom:latest +HEADROOM_PORT=8787 +HEADROOM_TELEMETRY=off +HEADROOM_LOG_LEVEL=INFO + +# OpenAI-compatible upstream target (Venice.ai default) +OPENAI_TARGET_API_URL=https://api.venice.ai/v1 + +# ----------------------------------------------------------------------------- +# Host paths +# ----------------------------------------------------------------------------- +# Required: host home path used to persist ~/.headroom and agent config mounts. +# Example on Unraid: +# HEADROOM_HOST_HOME=/mnt/user/appdata/headroom +HEADROOM_HOST_HOME=/mnt/user/appdata/headroom + +# Optional: workspace path used by the CLI profile container. +# Keep '.' to use the current compose directory context. +HEADROOM_WORKSPACE=. + +# ----------------------------------------------------------------------------- +# Langfuse tracing keys (used by headroom proxy) +# ----------------------------------------------------------------------------- +# Create these in Langfuse project settings. +LANGFUSE_PUBLIC_KEY=change-me +LANGFUSE_SECRET_KEY=change-me diff --git a/docker/langfuse/.env.example b/docker/langfuse/.env.example new file mode 100644 index 0000000..7714745 --- /dev/null +++ b/docker/langfuse/.env.example @@ -0,0 +1,126 @@ +# ============================================================================= +# Langfuse stack environment +# ============================================================================= +# Copy to .env and edit for your deployment. +# cp .env.example .env +# The actual .env is deployed by Dockhand and should not be committed. 
+# ============================================================================= + +# ----------------------------------------------------------------------------- +# Core application +# ----------------------------------------------------------------------------- +# Public URL for the Langfuse web app +NEXTAUTH_URL=https://langfuse.ld50.xyz +NEXTAUTH_SECRET=change-me-nextauth-secret + +# Required cryptographic values +# Generate with: +# openssl rand -hex 32 +SALT=change-me-32-char-random-salt +ENCRYPTION_KEY=change-me-64-hex-characters + +# Optional telemetry/feature flags +TELEMETRY_ENABLED=true +LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES=false + +# Optional bootstrap (leave empty to skip auto init) +LANGFUSE_INIT_ORG_ID= +LANGFUSE_INIT_ORG_NAME= +LANGFUSE_INIT_PROJECT_ID= +LANGFUSE_INIT_PROJECT_NAME= +LANGFUSE_INIT_PROJECT_PUBLIC_KEY= +LANGFUSE_INIT_PROJECT_SECRET_KEY= +LANGFUSE_INIT_USER_EMAIL= +LANGFUSE_INIT_USER_NAME= +LANGFUSE_INIT_USER_PASSWORD= + +# ----------------------------------------------------------------------------- +# PostgreSQL +# ----------------------------------------------------------------------------- +POSTGRES_VERSION=17 +POSTGRES_USER=postgres +POSTGRES_PASSWORD=change-me-postgres-password +POSTGRES_DB=postgres + +# Langfuse DB connection (must match Postgres settings above) +DATABASE_URL=postgresql://postgres:change-me-postgres-password@postgres:5432/postgres + +# ----------------------------------------------------------------------------- +# ClickHouse +# ----------------------------------------------------------------------------- +CLICKHOUSE_USER=clickhouse +CLICKHOUSE_PASSWORD=change-me-clickhouse-password +CLICKHOUSE_URL=http://clickhouse:8123 +CLICKHOUSE_MIGRATION_URL=clickhouse://clickhouse:9000 +CLICKHOUSE_CLUSTER_ENABLED=false + +# ----------------------------------------------------------------------------- +# Redis +# ----------------------------------------------------------------------------- +REDIS_HOST=redis 
+REDIS_PORT=6379 +REDIS_AUTH=change-me-redis-password +REDIS_TLS_ENABLED=false +REDIS_TLS_CA=/certs/ca.crt +REDIS_TLS_CERT=/certs/redis.crt +REDIS_TLS_KEY=/certs/redis.key + +# ----------------------------------------------------------------------------- +# MinIO / S3-compatible object storage +# ----------------------------------------------------------------------------- +MINIO_ROOT_USER=minio +MINIO_ROOT_PASSWORD=change-me-minio-password + +# Event uploads +LANGFUSE_S3_EVENT_UPLOAD_BUCKET=langfuse +LANGFUSE_S3_EVENT_UPLOAD_REGION=auto +LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID=minio +LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY=change-me-minio-password +LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT=http://minio:9000 +LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE=true +LANGFUSE_S3_EVENT_UPLOAD_PREFIX=events/ + +# Media uploads +LANGFUSE_S3_MEDIA_UPLOAD_BUCKET=langfuse +LANGFUSE_S3_MEDIA_UPLOAD_REGION=auto +LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID=minio +LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY=change-me-minio-password +LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT=http://localhost:9090 +LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE=true +LANGFUSE_S3_MEDIA_UPLOAD_PREFIX=media/ + +# Batch exports +LANGFUSE_S3_BATCH_EXPORT_ENABLED=false +LANGFUSE_S3_BATCH_EXPORT_BUCKET=langfuse +LANGFUSE_S3_BATCH_EXPORT_PREFIX=exports/ +LANGFUSE_S3_BATCH_EXPORT_REGION=auto +LANGFUSE_S3_BATCH_EXPORT_ENDPOINT=http://minio:9000 +LANGFUSE_S3_BATCH_EXPORT_EXTERNAL_ENDPOINT=http://localhost:9090 +LANGFUSE_S3_BATCH_EXPORT_ACCESS_KEY_ID=minio +LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY=change-me-minio-password +LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE=true + +# Optional cloud storage backends +LANGFUSE_USE_AZURE_BLOB=false +LANGFUSE_USE_OCI_NATIVE_OBJECT_STORAGE=false +LANGFUSE_OCI_AUTH_TYPE=workload_identity + +# ----------------------------------------------------------------------------- +# Ingestion tuning (optional) +# ----------------------------------------------------------------------------- 
+LANGFUSE_INGESTION_QUEUE_DELAY_MS=
+LANGFUSE_INGESTION_CLICKHOUSE_WRITE_INTERVAL_MS=
+
+# -----------------------------------------------------------------------------
+# OpenTelemetry (Langfuse self-observability)
+# -----------------------------------------------------------------------------
+OTEL_EXPORTER_OTLP_ENDPOINT=http://lgtm:4318
+OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
+OTEL_SERVICE_NAME=langfuse
+OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production
+
+# -----------------------------------------------------------------------------
+# Optional email
+# -----------------------------------------------------------------------------
+EMAIL_FROM_ADDRESS=
+SMTP_CONNECTION_URL=
diff --git a/docker/zitadel/compose.yaml b/docker/zitadel/compose.yaml
new file mode 100644
index 0000000..abe14c0
--- /dev/null
+++ b/docker/zitadel/compose.yaml
@@ -0,0 +1,172 @@
+name: zitadel
+
+services:
+  # Zitadel API server; `start-from-init` initialises the instance on first start.
+  zitadel-api:
+    image: ghcr.io/zitadel/zitadel:${ZITADEL_VERSION}
+    restart: unless-stopped
+    user: "0"
+    command: start-from-init --masterkey "${ZITADEL_MASTERKEY}"
+    environment:
+      ZITADEL_PORT: "8080"
+      ZITADEL_EXTERNALDOMAIN: ${ZITADEL_DOMAIN}
+      ZITADEL_EXTERNALPORT: ${ZITADEL_EXTERNALPORT}
+      ZITADEL_EXTERNALSECURE: ${ZITADEL_EXTERNALSECURE}
+      ZITADEL_TLS_ENABLED: "false"
+
+      ZITADEL_DATABASE_POSTGRES_DSN: ${ZITADEL_DATABASE_POSTGRES_DSN}
+
+      ZITADEL_FIRSTINSTANCE_ORG_HUMAN_PASSWORDCHANGEREQUIRED: "false"
+      ZITADEL_FIRSTINSTANCE_LOGINCLIENTPATPATH: /zitadel/bootstrap/login-client.pat
+      ZITADEL_FIRSTINSTANCE_ORG_LOGINCLIENT_MACHINE_USERNAME: login-client
+      ZITADEL_FIRSTINSTANCE_ORG_LOGINCLIENT_MACHINE_NAME: Automatically Initialized IAM_LOGIN_CLIENT
+      ZITADEL_FIRSTINSTANCE_ORG_LOGINCLIENT_PAT_EXPIRATIONDATE: ${LOGIN_CLIENT_PAT_EXPIRATION}
+
+      ZITADEL_DEFAULTINSTANCE_FEATURES_LOGINV2_REQUIRED: "true"
+      ZITADEL_DEFAULTINSTANCE_FEATURES_LOGINV2_BASEURI: ${ZITADEL_PUBLIC_SCHEME}://${ZITADEL_DOMAIN}/ui/v2/login/
+      ZITADEL_OIDC_DEFAULTLOGINURLV2: ${ZITADEL_PUBLIC_SCHEME}://${ZITADEL_DOMAIN}/ui/v2/login/login?authRequest=
+      ZITADEL_OIDC_DEFAULTLOGOUTURLV2: ${ZITADEL_PUBLIC_SCHEME}://${ZITADEL_DOMAIN}/ui/v2/login/logout?post_logout_redirect=
+      ZITADEL_SAML_DEFAULTLOGINURLV2: ${ZITADEL_PUBLIC_SCHEME}://${ZITADEL_DOMAIN}/ui/v2/login/login?samlRequest=
+
+      ZITADEL_LOGSTORE_ACCESS_STDOUT_ENABLED: ${ZITADEL_ACCESS_LOG_STDOUT_ENABLED}
+
+      ZITADEL_INSTRUMENTATION_TRACE_EXPORTER_TYPE: ${ZITADEL_INSTRUMENTATION_TRACE_EXPORTER_TYPE:-otlp}
+      ZITADEL_INSTRUMENTATION_TRACE_EXPORTER_ENDPOINT: ${ZITADEL_INSTRUMENTATION_TRACE_EXPORTER_ENDPOINT:-http://lgtm:4318}
+      ZITADEL_INSTRUMENTATION_TRACE_EXPORTER_INSECURE: ${ZITADEL_INSTRUMENTATION_TRACE_EXPORTER_INSECURE:-true}
+      ZITADEL_INSTRUMENTATION_SERVICENAME: ${ZITADEL_INSTRUMENTATION_SERVICENAME:-zitadel}
+
+      ZITADEL_CACHES_CONNECTORS_REDIS_ENABLED: ${ZITADEL_CACHES_CONNECTORS_REDIS_ENABLED}
+      ZITADEL_CACHES_CONNECTORS_REDIS_URL: ${ZITADEL_CACHES_CONNECTORS_REDIS_URL}
+      ZITADEL_CACHES_INSTANCE_CONNECTOR: ${ZITADEL_CACHES_INSTANCE_CONNECTOR}
+      ZITADEL_CACHES_MILESTONES_CONNECTOR: ${ZITADEL_CACHES_MILESTONES_CONNECTOR}
+      ZITADEL_CACHES_ORGANIZATION_CONNECTOR: ${ZITADEL_CACHES_ORGANIZATION_CONNECTOR}
+
+    healthcheck:
+      test:
+        - CMD
+        - /app/zitadel
+        - ready
+      interval: 10s
+      timeout: 30s
+      retries: 12
+      start_period: 20s
+    volumes:
+      - zitadel-bootstrap:/zitadel/bootstrap:rw
+    networks:
+      zitadel: {}
+      swag:
+        aliases:
+          - zitadel-api
+      pipeline:
+        aliases:
+          - zitadel-api
+    depends_on:
+      postgres:
+        condition: service_healthy
+
+  # Login v2 UI; reads the login-client PAT from the shared bootstrap volume (read-only).
+  zitadel-login:
+    image: ghcr.io/zitadel/zitadel-login:${ZITADEL_VERSION}
+    restart: unless-stopped
+    user: "0"
+    environment:
+      ZITADEL_API_URL: http://zitadel-api:8080
+      NEXT_PUBLIC_BASE_PATH: /ui/v2/login
+      ZITADEL_SERVICE_USER_TOKEN_FILE: /zitadel/bootstrap/login-client.pat
+      CUSTOM_REQUEST_HEADERS: Host:${ZITADEL_DOMAIN},X-Forwarded-Proto:${ZITADEL_PUBLIC_SCHEME}
+
+      # OpenTelemetry — export traces to the otel-lgtm pipeline
+      OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-zitadel-login}
+      OTEL_EXPORTER_OTLP_ENDPOINT: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://lgtm:4318}
+      OTEL_EXPORTER_OTLP_PROTOCOL: ${OTEL_EXPORTER_OTLP_PROTOCOL:-http/protobuf}
+    healthcheck:
+      test:
+        - CMD
+        - /bin/sh
+        - -c
+        - node /app/healthcheck.mjs http://localhost:3000/ui/v2/login/healthy
+      interval: 10s
+      timeout: 30s
+      retries: 12
+      start_period: 20s
+    volumes:
+      - zitadel-bootstrap:/zitadel/bootstrap:ro
+    networks:
+      zitadel: {}
+      swag:
+        aliases:
+          - zitadel-login
+      pipeline:
+        aliases:
+          - zitadel-login
+    depends_on:
+      zitadel-api:
+        condition: service_healthy
+
+  # Database backing Zitadel; attached to the internal network only.
+  postgres:
+    image: ${POSTGRES_IMAGE}
+    restart: unless-stopped
+    environment:
+      POSTGRES_PASSWORD: ${POSTGRES_ADMIN_PASSWORD}
+      POSTGRES_USER: ${POSTGRES_ADMIN_USER}
+      POSTGRES_DB: ${POSTGRES_DB}
+    healthcheck:
+      test:
+        - CMD-SHELL
+        - pg_isready -d ${POSTGRES_DB} -U ${POSTGRES_ADMIN_USER}
+      interval: 10s
+      timeout: 30s
+      retries: 10
+      start_period: 20s
+    volumes:
+      - postgres-data:/var/lib/postgresql/data:rw
+    networks:
+      # Internal network only. Postgres must not join `swag`: a `zitadel-login`
+      # alias there would make SWAG's DNS lookups for the login UI resolve to
+      # the database container as well as the real zitadel-login service.
+      zitadel: {}
+
+  # Optional Redis cache, started only with the `cache` profile; persistence is disabled.
+  redis:
+    image: ${REDIS_IMAGE}
+    restart: unless-stopped
+    profiles:
+      - cache
+    command:
+      - --save
+      - ""
+      - --appendonly
+      - "no"
+    networks:
+      - zitadel
+
+  # Optional OpenTelemetry collector, started only with the `observability` profile.
+  otel-collector:
+    image: ${OTEL_COLLECTOR_IMAGE}
+    restart: unless-stopped
+    profiles:
+      - observability
+    command:
+      - --config=/etc/otelcol/config.yaml
+    volumes:
+      - ./otel-collector-config.yaml:/etc/otelcol/config.yaml:ro
+    networks:
+      - zitadel
+
+networks:
+  # Internal network — postgres, redis, otel-collector not exposed to SWAG.
+  zitadel:
+    name: zitadel
+  # External network created by SWAG. Both zitadel-api and zitadel-login join it
+  # so SWAG's nginx can reach them by container name without publishing host ports.
+  swag:
+    name: ${SWAG_NETWORK}
+    external: true
+  pipeline:
+    name: pipeline
+    external: true
+
+volumes:
+  postgres-data:
+  zitadel-bootstrap: