feat: Implement dynamic Gatekeeper proxy and enhance service health monitoring

- **Implemented Dynamic Gatekeeper (Anubis) Proxy:**
  - Introduced Anubis as a Gatekeeper proxy layer for services (`web`, `web-staging`, `feedback`, `health`).
  - Added `docker-gen` setup (`docker-compose_gatekeeper.template.yml`, `gatekeeper-manager`) to dynamically configure Anubis instances based on container labels (`enable_gatekeeper=true`).
  - Updated HAProxy to route traffic through the respective Gatekeeper services.

- **Enhanced Service Health Monitoring & Checks:**
  - Integrated `django-health-check` into the Django application, providing detailed health endpoints (e.g., `/health/`).
  - Replaced the custom health check view with `django-health-check` URLs.
  - Added `psutil` for system metrics in health checks.
  - Made Gatus configuration dynamic using `docker-gen` (`config.template.yaml`), allowing automatic discovery and monitoring of service instances (e.g., web workers).
  - Externalized Gatus SMTP credentials to environment variables.
  - Strengthened `docker-compose_core.yml` with a combined `db-redis-healthcheck` service reporting to Gatus.
  - Added explicit health checks for `db` and `redis` services in `docker-compose.yml`.

- **Improved Docker & Compose Configuration:**
  - Added `depends_on` conditions in `docker-compose.yml` for `web` and `celery` services to wait for the database.
  - Updated `ALLOWED_HOSTS` in `docker-compose_staging.yml` and `docker-compose_web.yml` to include internal container names for Gatekeeper communication.
  - Set `DEBUG=False` for staging services.
  - Removed `.env.production` from `.gitignore` (standardized to `.env`).
  - Streamlined `scripts/entrypoint.sh` by removing the call to the no-longer-present `/deploy.sh`.

- **Dependency Updates:**
  - Added `django-health-check>=3.18.3` and `psutil>=7.0.0` to `pyproject.toml` and `uv.lock`.
  - Updated `settings.py` to include `health_check` apps, configuration, and use `REDIS_URL` consistently.

- **Streamlined the deployment script used in GitHub Actions (GHA):**
  - Updated the workflow to copy new server files and create a new `.env` file in the temporary directory before moving them into place.
  - Consolidated the stopping and removal of old containers into a single step for better clarity and efficiency.
  - Reduced container downtime by rearranging the stop/start steps.
This commit is contained in:
badblocks 2025-05-22 19:21:58 -07:00
parent f530790f6c
commit 6aa15d1af9
No known key found for this signature in database
16 changed files with 487 additions and 162 deletions

View file

@ -1,5 +1,5 @@
services:
db-healthcheck:
db-redis-healthcheck:
image: stephenc/postgresql-cli:latest
command:
- "sh"
@ -9,26 +9,47 @@ services:
# Give the database, Redis and Gatus time to start before the first probe.
sleep 30;
# Push one result to a Gatus external endpoint. Arguments: endpoint key,
# success flag ("true"/"false"), error text (may be empty).
# -G moves the url-encoded fields into the query string while -X POST keeps
# the method, so spaces, ampersands or newlines in the error text can neither
# break the URL nor be interpreted by the shell. A failed push is logged and
# otherwise ignored so that probing keeps running.
report() {
  curl -s -f -G -X POST \
    --connect-timeout 10 \
    --max-time 15 \
    --header "Authorization: Bearer ${GATUS_TOKEN}" \
    --data-urlencode "success=$$2" \
    --data-urlencode "error=$$3" \
    "http://health:8080/api/v1/endpoints/$$1/external" \
    || echo "Could not report $$1 status to Gatus";
}
# One pass per minute: probe Postgres, then Redis, reporting each result.
# Any failed probe ends the script with status 1 after it has been reported.
while true; do
  if pg_output=$$(pg_isready -d "${DJANGO_DATABASE_URL}" 2>&1); then
    report services_database true "";
    echo " Database is OK";
  else
    report services_database false "$$pg_output";
    echo "Database is not OK: $$pg_output";
    exit 1;
  fi;
  # Capture the whole conversation first and search it afterwards, so the
  # text is still available for the error report when no PONG came back.
  redis_output=$$(printf 'ping\nquit\n' | curl -v --max-time 10 --connect-timeout 10 telnet://redis:6379 2>&1 || true);
  if printf '%s\n' "$$redis_output" | grep -qF "+PONG"; then
    report services_redis true "";
    echo " Redis is OK";
  else
    report services_redis false "$$redis_output";
    echo "Redis is not OK: $$redis_output";
    exit 1;
  fi;
  sleep 60;
done
env_file:
- .env
@ -46,41 +67,114 @@ services:
# Fider feedback board.
feedback:
  image: getfider/fider:stable
  restart: always
  env_file:
    - .env
  labels:
    # Opts this container in to a generated Gatekeeper proxy instance.
    - "enable_gatekeeper=true"
# NOTE(review): this active `cadvisor` definition is followed directly by a
# fully commented-out copy of the same keys. Confirm which of the two is meant
# to remain and delete the other.
cadvisor:
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
privileged: true
devices:
- /dev/kmsg
image: gcr.io/cadvisor/cadvisor:v0.52.1
# cadvisor:
# volumes:
# - /:/rootfs:ro
# - /var/run:/var/run:ro
# - /sys:/sys:ro
# - /var/lib/docker/:/var/lib/docker:ro
# - /dev/disk/:/dev/disk:ro
# privileged: true
# devices:
# - /dev/kmsg
# image: gcr.io/cadvisor/cadvisor:v0.52.1
# Redis instance, published on host port 6379.
redis:
  image: redis:latest
  restart: always
  ports:
    # Quoted so the mapping is read as a string by every YAML parser.
    - "6379:6379"
# anubis:
# image: ghcr.io/techarohq/anubis:latest
# env_file:
# - .env
# dockergen:
# image: jwilder/docker-gen:latest
# container_name: dockergen_gatus_config
# command: -watch -notify-sighup gatus_service -only-exposed /app/config.template.yml /app/config.yaml
# restart: unless-stopped
# volumes:
# - /var/run/docker.sock:/tmp/docker.sock:ro
# - ./gatus:/app
# depends_on:
# - health
  # The container counts as healthy once `redis-cli ping` succeeds.
  healthcheck:
    test:
      - "CMD"
      - "redis-cli"
      - "ping"
    start_period: 10s
    interval: 10s
    timeout: 5s
    retries: 5
# Renders gatus/config.yaml from gatus/config.template.yaml with docker-gen,
# reading container state through the read-only Docker socket mount.
dockergen-health:
  image: nginxproxy/docker-gen:latest
  restart: unless-stopped
  volumes:
    - ./gatus:/gatus
    - /var/run/docker.sock:/tmp/docker.sock:ro
  # Same arguments as the single-string form, spelled out one per item.
  command:
    - "-wait"
    - "15s"
    - "-watch"
    - "/gatus/config.template.yaml"
    - "/gatus/config.yaml"
# Renders docker-compose_gatekeeper.yml from its template with docker-gen and
# sends SIGHUP to the gatekeeper manager after each regeneration.
dockergen-gatekeeper:
  image: nginxproxy/docker-gen:latest
  # All options must precede the positional <template> <dest> arguments:
  # Go-style flag parsing stops at the first non-flag argument, so a trailing
  # -notify-sighup would be silently ignored and the manager never signalled.
  # NOTE(review): the -notify-sighup target must be the manager's actual
  # container name or ID; Compose prefixes generated container names unless
  # container_name is set. Confirm that "gatekeeper-manager" resolves.
  command: >-
    -wait 15s -watch -notify-sighup gatekeeper-manager
    /gatekeeper/docker-compose_gatekeeper.template.yml
    /gatekeeper/docker-compose_gatekeeper.yml
  restart: unless-stopped
  volumes:
    - /var/run/docker.sock:/tmp/docker.sock:ro
    - ./:/gatekeeper
# Keeps the "gatekeepers" Compose project in sync with the generated
# docker-compose_gatekeeper.yml: re-applies it every REFRESH_INTERVAL seconds,
# restarts it on SIGHUP and tears it down on SIGTERM.
gatekeeper-manager:
  image: docker:latest
  restart: always
  stop_signal: SIGTERM
  volumes:
    - /srv:/srv:ro
    # Host Docker socket, needed for the `docker compose` calls below.
    - /var/run/docker.sock:/var/run/docker.sock
  environment:
    - REFRESH_INTERVAL=60
  entrypoint: ["/bin/sh", "-c"]
  # Every `$` intended for the shell is written `$$` so that Compose passes it
  # through untouched instead of attempting its own variable interpolation.
  # For the same reason the comments inside the script avoid dollar signs.
  command:
    - |
      set -eu -o pipefail

      COMPOSE_FILE_PATH="/srv/pkmntrade-club/docker-compose_gatekeeper.yml"
      PROJECT_DIR_PATH="/srv/pkmntrade-club"
      PROJECT_NAME_TAG="gatekeepers"

      # Print a timestamped log line; arguments: level, message.
      log() {
        echo "$$(date +'%Y-%m-%d %H:%M:%S') [$$1]: $$2"
      }

      # Stop and remove the gatekeeper project; a failure is only logged.
      gatekeeper_down() {
        log INFO "Taking gatekeepers down (Project: $$PROJECT_NAME_TAG)..."
        cd "$$PROJECT_DIR_PATH"
        if ! docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" down; then
          log WARN "'docker compose down' for $$PROJECT_NAME_TAG encountered an issue, but proceeding."
        fi
      }

      # Create or update the gatekeeper project; a failure is logged and the
      # next loop iteration tries again.
      gatekeeper_up() {
        log INFO "Bringing gatekeepers up/updating (Project: $$PROJECT_NAME_TAG, File: $$COMPOSE_FILE_PATH)..."
        cd "$$PROJECT_DIR_PATH"
        if ! docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" up -d --remove-orphans; then
          log ERROR "'docker compose up' for $$PROJECT_NAME_TAG failed. Will retry."
        fi
      }

      handle_sigterm() {
        log INFO "SIGTERM received. Initiating graceful shutdown for gatekeepers."
        gatekeeper_down
        log INFO "Gatekeepers shut down. Gatekeeper Manager exiting."
        exit 0
      }

      handle_sighup() {
        log INFO "SIGHUP received. Restarting gatekeepers."
        gatekeeper_down
        gatekeeper_up
        log INFO "Gatekeepers restarted following SIGHUP."
      }

      # POSIX signal names (no SIG prefix) for portability across shells.
      trap 'handle_sigterm' TERM
      trap 'handle_sighup' HUP

      log INFO "Gatekeeper Manager started."
      log INFO "Periodic refresh enabled: $$REFRESH_INTERVAL seconds."

      while true; do
        gatekeeper_up
        # Sleep in the background and wait on it so that a trapped signal can
        # interrupt the pause; a foreground sleep would delay the handler.
        sleep "$$REFRESH_INTERVAL" &
        sleep_pid=$$!
        # wait returns non-zero when a handled signal interrupts it; the
        # fallback keeps errexit from ending the loop. The SIGTERM handler
        # exits on its own, the SIGHUP handler returns here.
        wait "$$sleep_pid" || true
        # Reap a sleep left running by an interrupted wait so that repeated
        # signals do not accumulate stray processes.
        kill "$$sleep_pid" 2>/dev/null || true
        log INFO "Periodic refresh triggered."
      done
health:
image: twinproduction/gatus:latest
restart: always
labels:
- "enable_gatekeeper=true"
env_file:
- .env
environment: