pkmntrade.club/server/docker-compose_core.yml
badbl0cks c87d73435b
feat: Enhance gatekeeper resilience and host handling
This commit significantly improves the gatekeeper system's robustness, monitoring capabilities, and simplifies host header management for backend services.

Key changes include:

**Gatekeeper Health, Management & Resilience:**
- Implemented active health checking for individual gatekeeper containers within the `gatekeeper-manager` service.
    - The manager now periodically curls the `/metrics` endpoint of each gatekeeper container.
    - Reports health status to a new Gatus `services_gatekeeper` endpoint.
    - Automatically attempts to restart the gatekeeper stack if any gatekeeper instance is unhealthy or if the expected number of gatekeepers is not running.
- Refactored the `gatekeeper-manager` shell script for improved state management and signal handling:
    - Introduced `STARTED`, `RESTARTING`, `TERMINATING` state flags for more controlled operations.
    - Enhanced SIGTERM and SIGHUP handling to gracefully manage gatekeeper lifecycles.
    - Added `apk add curl` to ensure `curl` is available in the manager container.
- Renamed the gatekeeper Docker Compose template from `docker-compose_gatekeeper.template.yml` to `gatekeepers.template.yml` and its output to `gatekeepers.yml`.
- Updated `dockergen-gatekeeper` to watch the new template file and notify the correct `gatekeeper-manager` service instance (e.g., `pkmntrade-club-gatekeeper-manager-1`).
- Services that should be protected are now discovered by looking for a `gatekeeper=true` label.

**Host Header Management & `ALLOWED_HOSTS` Simplification:**
- HAProxy configuration (`haproxy.cfg`) now consistently sets the `Host` HTTP header for requests to all backend services (e.g., `pkmntrade.club`, `staging.pkmntrade.club`). This centralizes and standardizes host information.
- Consequently, explicit `ALLOWED_HOSTS` environment variables have been removed from the `web` and `celery` service definitions in `docker-compose_web.yml` and `docker-compose_staging.yml`. Backend Django applications should now rely on the `Host` header set by HAProxy for request validation.
- The `gatekeepers.template.yml` now defines a `TARGET_HOST` environment variable for proxied services (e.g., `web`, `web-staging`). This aligns with the ALLOWED_HOSTS on the target to ensure requests aren't blocked.

**Gatus Monitoring & Configuration Updates:**
- In Gatus configuration (`gatus/config.template.yaml`):
    - The "Redis" external service endpoint has been renamed to "Cache" for better clarity and to fit the theme of simple names.
    - A new external service endpoint "Gatekeeper" has been added to monitor the overall health reported by the `gatekeeper-manager`.
    - Health checks for "Web Worker" endpoints (both main and staging) now include the appropriate `Host` header (e.g., `Host: pkmntrade.club`) to ensure accurate health assessments by Django.
- In `docker-compose_core.yml`, the `curl` commands used by `db-redis-healthcheck` for database and cache health now append `|| true`. This prevents the script from exiting on a curl error (e.g., timeout, connection refused), ensuring that the failure is still reported to Gatus via the `success=false` parameter rather than the script terminating prematurely.

These changes collectively make the gatekeeper system more fault-tolerant, provide better visibility into its status, and streamline the configuration of backend applications by standardizing how they receive host information.
2025-05-23 16:16:59 -07:00

262 lines
No EOL
9.7 KiB
YAML

services:
db-redis-healthcheck:
  # Probes PostgreSQL (pg_isready) and Redis (PING over curl's telnet://
  # support) in a loop and pushes each result to the Gatus external-endpoint
  # API on the `health` service.
  #
  # NOTE(review): the script exits with status 1 on the first failed probe and
  # this service declares no `restart` policy, so monitoring stops after one
  # failure. Kept as in the original; confirm this is intended.
  image: stephenc/postgresql-cli:latest
  command:
    - "sh"
    - "-c"
    # Literal block scalar: every newline and `\` continuation reaches the
    # shell unchanged. `$$` is Compose's escape for a literal `$`; `${...}` is
    # substituted by Compose before the container starts.
    - |-
      apk --no-cache add curl
      sleep 30

      # report <endpoint-key> <true|false> <error-text>
      # Pushes one result to Gatus. The query string is built by curl
      # (-G plus --data-urlencode), so an ampersand never reaches the shell
      # and free-form error text is percent-encoded. "|| true" keeps a failed
      # push from being treated as a failed probe.
      report() {
        curl -s -f -G -X POST \
          --connect-timeout 10 \
          --max-time 15 \
          --header "Authorization: Bearer ${GATUS_TOKEN}" \
          --data-urlencode "success=$$2" \
          --data-urlencode "error=$$3" \
          "http://health:8080/api/v1/endpoints/$$1/external" || true
      }

      while true; do
        # --- PostgreSQL ---
        if pg_output=$$(pg_isready -d "${DJANGO_DATABASE_URL}" 2>&1); then
          pg_success="true"
          pg_error=""
        else
          pg_success="false"
          pg_error="$$pg_output"
        fi
        report services_database "$$pg_success" "$$pg_error"
        if [ "$$pg_success" = "true" ]; then
          echo " Database is OK"
        else
          echo "Database is not OK: $$pg_output"
          exit 1
        fi

        # --- Redis ---
        # Capture the full curl transcript first, then search it; capturing
        # the output of "grep -q" would always yield an empty string.
        redis_output=$$(echo -e "ping\nquit" | curl -v --max-time 10 --connect-timeout 10 telnet://redis:6379 2>&1) || true
        if printf '%s\n' "$$redis_output" | grep -q "+PONG"; then
          redis_success="true"
          redis_error=""
        else
          redis_success="false"
          redis_error="$$redis_output"
        fi
        report services_cache "$$redis_success" "$$redis_error"
        if [ "$$redis_success" = "true" ]; then
          echo " Redis is OK"
        else
          echo "Redis is not OK: $$redis_output"
          exit 1
        fi

        sleep 60
      done
  env_file:
    - .env
loba:
  # HAProxy container; publishes host port 443 and reads its configuration
  # and the ./certs directory from bind mounts.
  image: haproxy:3.1
  restart: always
  stop_signal: SIGTERM
  env_file:
    - .env
  ports:
    # Quoted so the mapping is always read as a string.
    - "443:443"
  volumes:
    - ./haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg
    - ./certs:/certs
feedback:
  # Fider container, configured through .env and labelled
  # enable_gatekeeper=true.
  image: getfider/fider:stable
  restart: always
  env_file:
    - .env
  labels:
    # Mapping form of the same label; the value is quoted to stay a string.
    enable_gatekeeper: "true"
# cadvisor:
# volumes:
# - /:/rootfs:ro
# - /var/run:/var/run:ro
# - /sys:/sys:ro
# - /var/lib/docker/:/var/lib/docker:ro
# - /dev/disk/:/dev/disk:ro
# privileged: true
# devices:
# - /dev/kmsg
# image: gcr.io/cadvisor/cadvisor:v0.52.1
redis:
  # Redis container with a `redis-cli ping` container healthcheck.
  image: redis:latest
  restart: always
  healthcheck:
    # Exec-form test command, written as a block sequence.
    test:
      - "CMD"
      - "redis-cli"
      - "ping"
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 10s
dockergen-health:
  # docker-gen instance that renders /gatus/config.template.yaml into
  # /gatus/config.yaml, watching for changes (-watch) with a 15s -wait.
  image: nginxproxy/docker-gen:latest
  restart: unless-stopped
  # List form of the same argument vector: options first, then template
  # and destination.
  command:
    - "-wait"
    - "15s"
    - "-watch"
    - "/gatus/config.template.yaml"
    - "/gatus/config.yaml"
  volumes:
    - /var/run/docker.sock:/tmp/docker.sock:ro
    - ./gatus:/gatus
dockergen-gatekeeper:
  # docker-gen instance that renders gatekeepers.template.yml into
  # gatekeepers.yml and sends SIGHUP to the gatekeeper-manager container
  # after regenerating the file.
  image: nginxproxy/docker-gen:latest
  # All options must precede the positional <template> <dest> arguments:
  # docker-gen uses Go's `flag` package, which stops parsing at the first
  # non-flag argument, so a trailing `-notify-sighup ...` is silently ignored.
  command: -wait 15s -watch -notify-sighup pkmntrade-club-gatekeeper-manager-1 /gatekeeper/gatekeepers.template.yml /gatekeeper/gatekeepers.yml
  restart: unless-stopped
  volumes:
    - /var/run/docker.sock:/tmp/docker.sock:ro
    - ./:/gatekeeper
gatekeeper-manager:
  # Supervises the separate "gatekeepers" Compose project through the host
  # Docker socket: brings it up, health-checks every container labelled
  # `gatekeeper`, reports the result to the Gatus `services_gatekeeper`
  # external endpoint, and restarts the stack on failure or on SIGHUP.
  image: docker:latest
  restart: always
  stop_signal: SIGTERM
  volumes:
    - /srv:/srv:ro
    - /var/run/docker.sock:/var/run/docker.sock
  environment:
    - REFRESH_INTERVAL=60
  entrypoint: ["/bin/sh", "-c"]
  # In the script below `$$` is Compose's escape for a literal `$`, while
  # `${GATUS_TOKEN}` is substituted by Compose before the container starts.
  command:
    - |
      set -eu -o pipefail
      apk add --no-cache curl

      COMPOSE_FILE_PATH="/srv/pkmntrade-club/gatekeepers.yml"
      PROJECT_DIR_PATH="/srv/pkmntrade-club"
      PROJECT_NAME_TAG="gatekeepers"
      GATUS_ENDPOINT_URL="http://health:8080/api/v1/endpoints/services_gatekeeper/external"

      # State flags, all holding the strings "true" or "false".
      TERMINATING="false"
      RESTARTING="false"
      STARTED="false"

      # log <LEVEL> <message>: timestamped line on stdout.
      log() {
        echo "$$(date +'%Y-%m-%d %H:%M:%S') [$$1]: $$2"
      }

      # report_status <true|false> <message>
      # Pushes the gatekeeper status to Gatus. curl builds the query string
      # (-G plus --data-urlencode), so the ampersand never reaches the shell
      # and messages containing spaces are percent-encoded. Runs in the
      # foreground; "|| true" keeps a failed push from tripping "set -e".
      report_status() {
        curl -s -f -G -X POST \
          --connect-timeout 10 \
          --max-time 15 \
          --header "Authorization: Bearer ${GATUS_TOKEN}" \
          --data-urlencode "success=$$1" \
          --data-urlencode "error=$$2" \
          "$$GATUS_ENDPOINT_URL" || true
      }

      # Succeeds while the manager is terminating, mid-restart, or has not
      # yet started the stack.
      manager_busy() {
        [ "$$TERMINATING" = "true" ] || [ "$$RESTARTING" = "true" ] || [ "$$STARTED" = "false" ]
      }

      # Stops the gatekeepers project; STARTED is cleared only on success.
      gatekeeper_down() {
        log INFO "Downing gatekeepers (Project: $$PROJECT_NAME_TAG)..."
        cd "$$PROJECT_DIR_PATH"
        if ! docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" down; then
          log WARN "'docker compose down' for $$PROJECT_NAME_TAG encountered an issue, but proceeding."
        else
          STARTED="false"
        fi
      }

      # Starts or refreshes the gatekeepers project; STARTED is set only on
      # success. Does nothing once shutdown has begun.
      gatekeeper_up() {
        if [ "$$TERMINATING" = "true" ]; then return; fi
        log INFO "Upping gatekeepers (Project: $$PROJECT_NAME_TAG, File: $$COMPOSE_FILE_PATH)..."
        cd "$$PROJECT_DIR_PATH"
        if ! docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" up -d --remove-orphans; then
          log ERROR "'docker compose up' for $$PROJECT_NAME_TAG failed. Will retry."
        else
          STARTED="true"
        fi
      }

      # Down followed by up, guarded by RESTARTING.
      restart_gatekeepers() {
        if manager_busy; then return; fi
        RESTARTING="true"
        log INFO "Restarting gatekeepers."
        gatekeeper_down
        gatekeeper_up
        log INFO "Gatekeepers restarted."
        RESTARTING="false"
      }

      # Returns 0 when the check is skipped or every gatekeeper is healthy.
      # Otherwise reports the failure, restarts the stack and returns 1.
      gatekeeper_healthcheck() {
        if manager_busy; then
          log INFO "Gatekeeper Manager is terminating/restarting/not started. Skipping healthcheck."
          return 0
        fi
        ERROR_MSG=""
        log INFO "Checking gatekeepers health..."
        num_containers=$$(docker ps -q -a --filter "label=gatekeeper" | wc -l)
        num_running=$$(docker ps -q -a --filter "label=gatekeeper" --filter "status=running" | wc -l)
        if [ "$$num_containers" -eq 0 ]; then
          ERROR_MSG="No gatekeepers found. Healthcheck failed."
        elif [ "$$num_running" -ne "$$num_containers" ]; then
          ERROR_MSG="Gatekeeper containers are missing or not running. Healthcheck failed."
        else
          # Expect HTTP 200 from the /metrics endpoint of every gatekeeper.
          for container in $$(docker ps -q -a --filter "label=gatekeeper"); do
            # Timeouts stop an unresponsive gatekeeper from stalling the
            # manager; "|| true" keeps a connection error (curl prints 000)
            # from aborting the script under "set -e".
            http_code=$$(curl -s -o /dev/null -w "%{http_code}" \
              --connect-timeout 5 \
              --max-time 10 \
              -H "X-Real-Ip: 127.0.0.1" \
              "http://$$container:9090/metrics" || true)
            if [ "$$http_code" != "200" ]; then
              container_name=$$(docker ps -a --filter "label=gatekeeper" --filter "id=$$container" --format "{{.Names}}")
              ERROR_MSG="Gatekeeper container $$container_name is unhealthy. Healthcheck failed."
            fi
          done
        fi
        if [ "$$ERROR_MSG" != "" ]; then
          log ERROR "$$ERROR_MSG"
          report_status "false" "$$ERROR_MSG"
          restart_gatekeepers
          return 1
        else
          log INFO "All gatekeepers are OK/HEALTHY."
          report_status "true" "HEALTHY"
        fi
      }

      # SIGTERM: report, stop the gatekeepers, then exit 0.
      handle_sigterm() {
        if [ "$$TERMINATING" = "true" ]; then return; fi
        TERMINATING="true"
        log INFO "SIGTERM received. Initiating graceful shutdown for gatekeepers."
        report_status "false" "SIGTERM received. Shutting down."
        gatekeeper_down
        log INFO "Gatekeepers shut down. Gatekeeper Manager exiting."
        exit 0
      }

      # SIGHUP: restart the gatekeepers unless the manager is busy.
      handle_sighup() {
        if manager_busy; then return; fi
        log INFO "SIGHUP received."
        restart_gatekeepers
      }

      trap 'handle_sigterm' SIGTERM
      trap 'handle_sighup' SIGHUP

      log INFO "Gatekeeper Manager started."
      log INFO "Periodic refresh enabled: $$REFRESH_INTERVAL seconds. Initial wait started."

      while [ "$$TERMINATING" = "false" ]; do
        # Sleeping in the background and waiting on it lets a trapped signal
        # interrupt the pause. "|| true" keeps the loop alive when the wait
        # is cut short by SIGHUP; the SIGTERM handler exits on its own.
        sleep $$REFRESH_INTERVAL &
        wait $$! || true
        log INFO "Periodic healthcheck and refresh triggered."
        if [ ! -f "$$COMPOSE_FILE_PATH" ]; then
          log ERROR "Gatekeepers.yml has not been generated after $$REFRESH_INTERVAL seconds. Please check dockergen-gatekeeper is running correctly. Exiting."
          exit 1
        fi
        if gatekeeper_healthcheck && [ "$$RESTARTING" = "false" ]; then
          gatekeeper_up
        fi
      done
health:
  # Gatus container; reads its configuration from ./gatus mounted at /config
  # and carries the enable_gatekeeper=true label.
  image: twinproduction/gatus:latest
  restart: always
  env_file:
    - .env
  environment:
    # Mapping form; quoted so the value stays a string.
    GATUS_DELAY_START_SECONDS: "30"
  labels:
    enable_gatekeeper: "true"
  volumes:
    - ./gatus:/config