This commit significantly improves the gatekeeper system's robustness, monitoring capabilities, and simplifies host header management for backend services.
Key changes include:
**Gatekeeper Health, Management & Resilience:**
- Implemented active health checking for individual gatekeeper containers within the `gatekeeper-manager` service.
- The manager now periodically curls the `/metrics` endpoint of each gatekeeper container.
- Reports health status to a new Gatus `services_gatekeeper` endpoint.
- Automatically attempts to restart the gatekeeper stack if any gatekeeper instance is unhealthy or if the expected number of gatekeepers is not running.
- Refactored the `gatekeeper-manager` shell script for improved state management and signal handling:
- Introduced `STARTED`, `RESTARTING`, `TERMINATING` state flags for more controlled operations.
- Enhanced SIGTERM and SIGHUP handling to gracefully manage gatekeeper lifecycles.
- Added `apk add curl` to ensure `curl` is available in the manager container.
- Renamed the gatekeeper Docker Compose template from `docker-compose_gatekeeper.template.yml` to `gatekeepers.template.yml` and its output to `gatekeepers.yml`.
- Updated `dockergen-gatekeeper` to watch the new template file and notify the correct `gatekeeper-manager` service instance (e.g., `pkmntrade-club-gatekeeper-manager-1`).
- Services that should be protected are now discovered via their `enable_gatekeeper=true` label.
**Host Header Management & `ALLOWED_HOSTS` Simplification:**
- HAProxy configuration (`haproxy.cfg`) now consistently sets the `Host` HTTP header for requests to all backend services (e.g., `pkmntrade.club`, `staging.pkmntrade.club`). This centralizes and standardizes host information.
- Consequently, explicit `ALLOWED_HOSTS` environment variables have been removed from the `web` and `celery` service definitions in `docker-compose_web.yml` and `docker-compose_staging.yml`. Backend Django applications should now rely on the `Host` header set by HAProxy for request validation.
- The `gatekeepers.template.yml` now defines a `TARGET_HOST` environment variable for proxied services (e.g., `web`, `web-staging`). This aligns with the ALLOWED_HOSTS on the target to ensure requests aren't blocked.
**Gatus Monitoring & Configuration Updates:**
- In Gatus configuration (`gatus/config.template.yaml`):
- The "Redis" external service endpoint has been renamed to "Cache" for better clarity and to fit the theme of simple names.
- A new external service endpoint "Gatekeeper" has been added to monitor the overall health reported by the `gatekeeper-manager`.
- Health checks for "Web Worker" endpoints (both main and staging) now include the appropriate `Host` header (e.g., `Host: pkmntrade.club`) to ensure accurate health assessments by Django.
- In `docker-compose_core.yml`, the `curl` commands used by `db-redis-healthcheck` for database and cache health now append `|| true`. This prevents the script from exiting on a curl error (e.g., timeout, connection refused), ensuring that the failure is still reported to Gatus via the `success=false` parameter rather than the script terminating prematurely.
These changes collectively make the gatekeeper system more fault-tolerant, provide better visibility into its status, and streamline the configuration of backend applications by standardizing how they receive host information.
262 lines
No EOL
9.7 KiB
YAML
262 lines
No EOL
9.7 KiB
YAML
services:
|
|
db-redis-healthcheck:
  # Probes PostgreSQL and Redis in a loop (60 s pause between rounds) and
  # pushes each result to the Gatus external endpoints services_database and
  # services_cache.
  # NOTE(review): the script exits 1 on the first failed probe and this
  # service declares no restart policy, so monitoring stops until the
  # container is started again - confirm this is intended.
  image: stephenc/postgresql-cli:latest
  command:
    - "sh"
    - "-c"
    # Literal block scalar: newlines are preserved, so shell comments and
    # backslash continuations behave exactly as written. "$$" is the Compose
    # escape for a literal dollar sign; "${...}" is substituted by Compose
    # before the shell ever runs.
    - |
      apk --no-cache add curl;
      sleep 30;

      # report ENDPOINT_KEY SUCCESS ERROR_TEXT
      # Sends one result to Gatus. The values are passed as URL-encoded query
      # parameters (-G) so that an ampersand or a space in the error text can
      # neither background the command nor produce a malformed URL.
      # "|| true" keeps a Gatus outage from affecting the probe result.
      report() {
        curl -s -f -G -X POST \
          --connect-timeout 10 \
          --max-time 15 \
          --header "Authorization: Bearer ${GATUS_TOKEN}" \
          --data-urlencode "success=$$2" \
          --data-urlencode "error=$$3" \
          "http://health:8080/api/v1/endpoints/$$1/external" || true;
      }

      while true; do
        # --- PostgreSQL ---
        if pg_output=$$(pg_isready -d "${DJANGO_DATABASE_URL}" 2>&1); then
          pg_success="true";
          pg_error="";
        else
          pg_success="false";
          pg_error="$$pg_output";
        fi;
        report services_database "$$pg_success" "$$pg_error";
        if [ "$$pg_success" = "true" ]; then
          echo " Database is OK";
        else
          echo "Database is not OK: $$pg_output";
          exit 1;
        fi;

        # --- Redis ---
        # Capture the conversation first and grep afterwards; piping into
        # "grep -q" inside the substitution would always capture nothing.
        redis_output=$$(echo -e "ping\nquit" | curl -v --max-time 10 --connect-timeout 10 telnet://redis:6379 2>&1);
        if printf '%s\n' "$$redis_output" | grep -qF "+PONG"; then
          redis_success="true";
          redis_error="";
        else
          redis_success="false";
          redis_error="$$redis_output";
        fi;
        report services_cache "$$redis_success" "$$redis_error";
        if [ "$$redis_success" = "true" ]; then
          echo " Redis is OK";
        else
          echo "Redis is not OK: $$redis_output";
          exit 1;
        fi;

        sleep 60;
      done
  env_file:
    - .env
|
|
loba:
  # HAProxy instance; publishes port 443 and reads its configuration and
  # certificates from bind mounts in the project directory.
  image: haproxy:3.1
  restart: always
  stop_signal: SIGTERM
  env_file:
    - .env
  ports:
    # Quoted so the mapping is always parsed as a string.
    - "443:443"
  volumes:
    - ./haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg
    - ./certs:/certs
|
|
feedback:
  # Fider container, tagged with the enable_gatekeeper label.
  image: getfider/fider:stable
  restart: always
  env_file:
    - .env
  labels:
    enable_gatekeeper: "true"
|
|
# cadvisor:
|
|
# volumes:
|
|
# - /:/rootfs:ro
|
|
# - /var/run:/var/run:ro
|
|
# - /sys:/sys:ro
|
|
# - /var/lib/docker/:/var/lib/docker:ro
|
|
# - /dev/disk/:/dev/disk:ro
|
|
# privileged: true
|
|
# devices:
|
|
# - /dev/kmsg
|
|
# image: gcr.io/cadvisor/cadvisor:v0.52.1
|
|
redis:
  # Redis server with a container-level healthcheck based on "redis-cli ping".
  image: redis:latest
  restart: always
  healthcheck:
    test:
      - "CMD"
      - "redis-cli"
      - "ping"
    start_period: 10s
    interval: 10s
    timeout: 5s
    retries: 5
|
|
dockergen-health:
  # docker-gen instance that renders /gatus/config.yaml from
  # /gatus/config.template.yaml, using the Docker socket (mounted read-only).
  image: nginxproxy/docker-gen:latest
  restart: unless-stopped
  command:
    - "-wait"
    - "15s"
    - "-watch"
    - "/gatus/config.template.yaml"
    - "/gatus/config.yaml"
  volumes:
    - /var/run/docker.sock:/tmp/docker.sock:ro
    - ./gatus:/gatus
|
|
dockergen-gatekeeper:
  # docker-gen instance that renders /gatekeeper/gatekeepers.yml from
  # /gatekeeper/gatekeepers.template.yml and then sends SIGHUP to the
  # container named pkmntrade-club-gatekeeper-manager-1.
  image: nginxproxy/docker-gen:latest
  restart: unless-stopped
  command:
    - "-wait"
    - "15s"
    - "-watch"
    - "/gatekeeper/gatekeepers.template.yml"
    - "/gatekeeper/gatekeepers.yml"
    - "-notify-sighup"
    - "pkmntrade-club-gatekeeper-manager-1"
  volumes:
    - /var/run/docker.sock:/tmp/docker.sock:ro
    - ./:/gatekeeper
|
|
gatekeeper-manager:
  # Supervises the generated "gatekeepers" Compose project through the Docker
  # socket: brings it up, health-checks every container labelled "gatekeeper"
  # once per REFRESH_INTERVAL seconds, reports the outcome to the Gatus
  # external endpoint services_gatekeeper, and restarts the stack when a check
  # fails or SIGHUP arrives. SIGTERM takes the stack down and exits.
  image: docker:latest
  restart: always
  stop_signal: SIGTERM
  volumes:
    - /srv:/srv:ro
    - /var/run/docker.sock:/var/run/docker.sock
  environment:
    - REFRESH_INTERVAL=60
  entrypoint: ["/bin/sh", "-c"]
  command:
    # "$$" is the Compose escape for a literal dollar sign and is used for
    # every shell expansion below; "${GATUS_TOKEN}" is intentionally left for
    # Compose to substitute before the script runs.
    - |
      set -eu -o pipefail
      apk add --no-cache curl

      COMPOSE_FILE_PATH="/srv/pkmntrade-club/gatekeepers.yml"
      PROJECT_DIR_PATH="/srv/pkmntrade-club"
      PROJECT_NAME_TAG="gatekeepers"

      # State flags; each holds the string "true" or "false".
      TERMINATING="false"
      RESTARTING="false"
      STARTED="false"

      # log LEVEL MESSAGE - timestamped log line on stdout.
      log() {
        echo "$$(date +'%Y-%m-%d %H:%M:%S') [$$1]: $$2"
      }

      # report_gatus SUCCESS ERROR_TEXT
      # Pushes one result to Gatus. Values go out as URL-encoded query
      # parameters (-G) so an ampersand or a space in the text can neither
      # background the command nor produce a malformed URL. Failures to reach
      # Gatus are ignored on purpose.
      report_gatus() {
        curl -s -f -G -X POST \
          --connect-timeout 10 \
          --max-time 15 \
          --header "Authorization: Bearer ${GATUS_TOKEN}" \
          --data-urlencode "success=$$1" \
          --data-urlencode "error=$$2" \
          "http://health:8080/api/v1/endpoints/services_gatekeeper/external" || true
      }

      # manager_busy - succeeds while the manager is terminating, in the
      # middle of a restart, or has not brought the stack up yet.
      manager_busy() {
        [ "$$TERMINATING" = "true" ] || [ "$$RESTARTING" = "true" ] || [ "$$STARTED" = "false" ]
      }

      # Takes the stack down; STARTED is cleared only when that succeeds.
      gatekeeper_down() {
        log INFO "Downing gatekeepers (Project: $$PROJECT_NAME_TAG)..."
        cd "$$PROJECT_DIR_PATH"
        if docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" down; then
          STARTED="false"
        else
          log WARN "'docker compose down' for $$PROJECT_NAME_TAG encountered an issue, but proceeding."
        fi
      }

      # Brings the stack up (no-op while terminating); STARTED is set only
      # when that succeeds, so a failed attempt is retried on the next cycle.
      gatekeeper_up() {
        if [ "$$TERMINATING" = "true" ]; then return; fi
        log INFO "Upping gatekeepers (Project: $$PROJECT_NAME_TAG, File: $$COMPOSE_FILE_PATH)..."
        cd "$$PROJECT_DIR_PATH"
        if docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" up -d --remove-orphans; then
          STARTED="true"
        else
          log ERROR "'docker compose up' for $$PROJECT_NAME_TAG failed. Will retry."
        fi
      }

      # Down followed by up, guarded by RESTARTING so it cannot nest.
      restart_gatekeepers() {
        if manager_busy; then return; fi
        RESTARTING="true"
        log INFO "Restarting gatekeepers."
        gatekeeper_down
        gatekeeper_up
        log INFO "Gatekeepers restarted."
        RESTARTING="false"
      }

      # Returns 0 when every gatekeeper is healthy (or the check is skipped),
      # 1 after reporting a failure and triggering a restart.
      gatekeeper_healthcheck() {
        if manager_busy; then
          log INFO "Gatekeeper Manager is terminating/restarting/not started. Skipping healthcheck."
          return 0
        fi
        ERROR_MSG=""
        log INFO "Checking gatekeepers health..."

        num_containers=$$(docker ps -q -a --filter "label=gatekeeper" | wc -l)
        num_running=$$(docker ps -q -a --filter "label=gatekeeper" --filter "status=running" | wc -l)

        if [ "$$num_containers" -eq 0 ]; then
          ERROR_MSG="No gatekeepers found. Healthcheck failed."
        elif [ "$$num_running" -ne "$$num_containers" ]; then
          ERROR_MSG="Gatekeeper containers are missing or not running. Healthcheck failed."
        else
          # Expect HTTP 200 from /metrics on every gatekeeper container.
          for container in $$(docker ps -q -a --filter "label=gatekeeper"); do
            # Timeouts keep one unresponsive container from stalling the
            # loop; on a connection failure curl prints 000 as the code.
            status=$$(curl -s -o /dev/null -w "%{http_code}" \
              --connect-timeout 5 --max-time 10 \
              -H "X-Real-Ip: 127.0.0.1" \
              "http://$$container:9090/metrics" || true)
            if [ "$$status" != "200" ]; then
              container_name=$$(docker ps -a --filter "label=gatekeeper" --filter "id=$$container" --format "{{.Names}}")
              ERROR_MSG="Gatekeeper container $$container_name is unhealthy. Healthcheck failed."
            fi
          done
        fi

        if [ -n "$$ERROR_MSG" ]; then
          log ERROR "$$ERROR_MSG"
          report_gatus "false" "$$ERROR_MSG"
          restart_gatekeepers
          return 1
        fi

        log INFO "All gatekeepers are OK/HEALTHY."
        report_gatus "true" "HEALTHY"
      }

      # SIGTERM: report, take the stack down, exit 0. Runs at most once.
      handle_sigterm() {
        if [ "$$TERMINATING" = "true" ]; then return; fi
        TERMINATING="true"
        log INFO "SIGTERM received. Initiating graceful shutdown for gatekeepers."
        report_gatus "false" "SIGTERM received. Shutting down."
        gatekeeper_down
        log INFO "Gatekeepers shut down. Gatekeeper Manager exiting."
        exit 0
      }

      # SIGHUP: restart the stack unless the manager is busy.
      handle_sighup() {
        if manager_busy; then return; fi
        log INFO "SIGHUP received."
        restart_gatekeepers
      }

      trap 'handle_sigterm' TERM
      trap 'handle_sighup' HUP

      log INFO "Gatekeeper Manager started."
      log INFO "Periodic refresh enabled: $$REFRESH_INTERVAL seconds. Initial wait started."

      while [ "$$TERMINATING" = "false" ]; do
        # Sleeping in the background and waiting on it lets a trapped signal
        # interrupt the pause. "|| true" keeps the loop alive when the wait
        # is cut short by SIGHUP; the SIGTERM handler exits by itself.
        sleep "$$REFRESH_INTERVAL" &
        wait $$! || true

        log INFO "Periodic healthcheck and refresh triggered."

        if [ ! -f "$$COMPOSE_FILE_PATH" ]; then
          log ERROR "Gatekeepers.yml has not been generated after $$REFRESH_INTERVAL seconds. Please check dockergen-gatekeeper is running correctly. Exiting."
          exit 1
        fi

        # A passing (or skipped) check is followed by "up", which both
        # performs the initial start and applies a regenerated compose file.
        if gatekeeper_healthcheck && [ "$$RESTARTING" = "false" ]; then
          gatekeeper_up
        fi
      done
|
|
health:
  # Gatus instance; reads its configuration from ./gatus (mounted at /config)
  # and is tagged with the enable_gatekeeper label.
  image: twinproduction/gatus:latest
  restart: always
  env_file:
    - .env
  environment:
    GATUS_DELAY_START_SECONDS: "30"
  labels:
    enable_gatekeeper: "true"
  volumes:
    - ./gatus:/config