pkmntrade.club/server/gatus/config.template.yaml
badbl0cks 6aa15d1af9
feat: Implement dynamic Gatekeeper proxy and enhance service health monitoring
- **Implemented Dynamic Gatekeeper (Anubis) Proxy:**
  - Introduced Anubis as a Gatekeeper proxy layer for services (`web`, `web-staging`, `feedback`, `health`).
  - Added `docker-gen` setup (`docker-compose_gatekeeper.template.yml`, `gatekeeper-manager`) to dynamically configure Anubis instances based on container labels (`enable_gatekeeper=true`).
  - Updated HAProxy to route traffic through the respective Gatekeeper services.

- **Enhanced Service Health Monitoring & Checks:**
  - Integrated `django-health-check` into the Django application, providing detailed health endpoints (e.g., `/health/`).
  - Replaced the custom health check view with `django-health-check` URLs.
  - Added `psutil` for system metrics in health checks.
  - Made Gatus configuration dynamic using `docker-gen` (`config.template.yaml`), allowing automatic discovery and monitoring of service instances (e.g., web workers).
  - Externalized Gatus SMTP credentials to environment variables.
  - Strengthened `docker-compose_core.yml` with a combined `db-redis-healthcheck` service reporting to Gatus.
  - Added explicit health checks for `db` and `redis` services in `docker-compose.yml`.

- **Improved Docker & Compose Configuration:**
  - Added `depends_on` conditions in `docker-compose.yml` for `web` and `celery` services to wait for the database.
  - Updated `ALLOWED_HOSTS` in `docker-compose_staging.yml` and `docker-compose_web.yml` to include internal container names for Gatekeeper communication.
  - Set `DEBUG=False` for staging services.
  - Removed `.env.production` from `.gitignore` (standardized to `.env`).
  - Streamlined `scripts/entrypoint.sh` by removing the call to the no-longer-present `/deploy.sh`.

- **Dependency Updates:**
  - Added `django-health-check>=3.18.3` and `psutil>=7.0.0` to `pyproject.toml` and `uv.lock`.
  - Updated `settings.py` to include `health_check` apps, configuration, and use `REDIS_URL` consistently.

- **Streamlined deployment script used in GHA:**
  - Updated the workflow to copy new server files and create a new `.env` file in the temporary directory before moving them into place.
  - Consolidated the stopping and removal of old containers into a single step for better clarity and efficiency.
  - Reduced container downtime by rearranging the stop/start steps.
2025-05-23 00:15:19 -07:00

145 lines
3.5 KiB
YAML

# Persistence backend: PostgreSQL, with the connection string supplied
# through the GATUS_DATABASE_URL placeholder.
storage:
  type: postgres
  path: "${GATUS_DATABASE_URL}"
# Web server settings: read buffer of 32768 bytes (32 KiB).
web:
  read-buffer-size: 32768
# Connectivity self-check: probe 1.1.1.1 on port 53 once every 60 seconds.
# NOTE(review): per the Gatus docs, endpoint checks are skipped while this
# probe is failing -- confirm for the deployed Gatus version.
connectivity:
  checker:
    target: 1.1.1.1:53
    interval: 60s
# Externally reported checks: no URL is polled for these entries; each one
# is identified by name/group and uses the GATUS_TOKEN placeholder as its
# token. Both raise email alerts.
external-endpoints:
# Database status.
- name: Database
  group: Services
  token: "${GATUS_TOKEN}"
  alerts:
  - type: email
# Redis status.
- name: Redis
  group: Services
  token: "${GATUS_TOKEN}"
  alerts:
  - type: email
# Polled checks. Every entry below raises an email alert.
endpoints:
# --- Expirations: evaluated hourly against https://pkmntrade.club ---
# Domain registration must have more than 720h left.
- name: Domain
  group: Expirations
  url: "https://pkmntrade.club"
  interval: 1h
  conditions:
  - "[DOMAIN_EXPIRATION] > 720h"
  alerts:
  - type: email
# TLS certificate must have more than 240h left.
- name: Certificate
  group: Expirations
  url: "https://pkmntrade.club"
  interval: 1h
  conditions:
  - "[CERTIFICATE_EXPIRATION] > 240h"
  alerts:
  - type: email
# --- DNS: every 60s, query the A record of pkmntrade.club through three
# resolvers and require a NOERROR response code from each. ---
# Resolver 1.1.1.1
- name: Cloudflare
  group: DNS
  url: "1.1.1.1"
  interval: 60s
  dns:
    query-name: "pkmntrade.club"
    query-type: "A"
  conditions:
  - "[DNS_RCODE] == NOERROR"
  alerts:
  - type: email
# Resolver 8.8.8.8
- name: Google
  group: DNS
  url: "8.8.8.8"
  interval: 60s
  dns:
    query-name: "pkmntrade.club"
    query-type: "A"
  conditions:
  - "[DNS_RCODE] == NOERROR"
  alerts:
  - type: email
# Resolver 9.9.9.9
- name: Quad9
  group: DNS
  url: "9.9.9.9"
  interval: 60s
  dns:
    query-name: "pkmntrade.club"
    query-type: "A"
  conditions:
  - "[DNS_RCODE] == NOERROR"
  alerts:
  - type: email
# --- Internal services, polled every 60s ---
# Load balancer: http://loba/ must answer 200 with the exact body
# "OK/HEALTHY".
- name: HAProxy
  group: Load Balancer
  url: "http://loba/"
  interval: 60s
  conditions:
  - "[STATUS] == 200"
  - "[BODY] == OK/HEALTHY"
  alerts:
  - type: email
# Feedback service: http://feedback:3000/ must answer 200.
- name: Feedback
  group: Services
  url: "http://feedback:3000/"
  interval: 60s
  conditions:
  - "[STATUS] == 200"
  alerts:
  - type: email
# Template prelude: split the containers passed to this template into two
# lists by their "com.docker.compose.service" label ("web" and
# "web-staging"), then sort each list by container Name in ascending order.
{{ $all_containers := . }}
{{ $web_containers := list }}
{{ $web_staging_containers := list }}
{{ range $candidate := $all_containers }}
{{ $composeService := index $candidate.Labels "com.docker.compose.service" }}
{{ if eq $composeService "web" }}
{{ $web_containers = append $web_containers $candidate }}
{{ else if eq $composeService "web-staging" }}
{{ $web_staging_containers = append $web_staging_containers $candidate }}
{{ end }}
{{ end }}
{{ $web_containers = sortObjectsByKeysAsc $web_containers "Name" }}
{{ $web_staging_containers = sortObjectsByKeysAsc $web_staging_containers "Name" }}
# One endpoint per "web" container (group Main): poll /health/ on port 8000
# of the container every 60s and require HTTP 200. The entry is labelled
# with the value of the "com.docker.compose.container-number" label.
{{ range $worker := $web_containers }}
{{ $replica := index $worker.Labels "com.docker.compose.container-number" }}
- name: "Web Worker {{ $replica }}"
  group: Main
  url: "http://{{ $worker.Name }}:8000/health/"
  interval: 60s
  conditions:
  - "[STATUS] == 200"
  # Body match is currently commented out:
  # - "[BODY] == OK/HEALTHY"
  alerts:
  - type: email
{{ end }}
# One endpoint per "web-staging" container (group Staging): poll /health/
# on port 8000 of the container every 60s and require HTTP 200. The entry
# is labelled with the "com.docker.compose.container-number" label value.
{{ range $worker := $web_staging_containers }}
{{ $replica := index $worker.Labels "com.docker.compose.container-number" }}
- name: "Web Worker {{ $replica }}"
  group: Staging
  url: "http://{{ $worker.Name }}:8000/health/"
  interval: 60s
  conditions:
  - "[STATUS] == 200"
  # Body match is currently commented out:
  # - "[BODY] == OK/HEALTHY"
  alerts:
  - type: email
{{ end }}
# Email alerting. Sender, credentials, host, port and recipient all come
# from GATUS_SMTP_* placeholders.
alerting:
  email:
    from: "${GATUS_SMTP_FROM}"
    username: "${GATUS_SMTP_USER}"
    password: "${GATUS_SMTP_PASS}"
    host: "${GATUS_SMTP_HOST}"
    # Kept unquoted so the substituted value can be read as a number.
    # NOTE(review): relies on placeholders being expanded before the YAML
    # is parsed -- confirm against the Gatus version in use.
    port: ${GATUS_SMTP_PORT}
    to: "${GATUS_SMTP_TO}"
    client:
      insecure: false
    # Alert defaults: enabled, trigger after 3 consecutive failures,
    # resolve after 2 consecutive successes, and notify on resolution.
    # NOTE(review): presumably inherited by every "- type: email" alert
    # above -- confirm in the Gatus alerting docs.
    default-alert:
      enabled: true
      failure-threshold: 3
      success-threshold: 2
      send-on-resolved: true