From c87d73435b47fc0664531a2d331545dac862b0f1 Mon Sep 17 00:00:00 2001
From: badbl0cks <4161747+badbl0cks@users.noreply.github.com>
Date: Fri, 23 May 2025 16:16:59 -0700
Subject: [PATCH] feat: Enhance gatekeeper resilience and host handling

This commit significantly improves the gatekeeper system's robustness and monitoring capabilities, and simplifies host header management for backend services.

Key changes include:

**Gatekeeper Health, Management & Resilience:**
- Implemented active health checking for individual gatekeeper containers within the `gatekeeper-manager` service.
  - The manager now periodically curls the `/metrics` endpoint of each gatekeeper container.
  - Reports health status to a new Gatus `services_gatekeeper` endpoint.
  - Automatically attempts to restart the gatekeeper stack if any gatekeeper instance is unhealthy or if the expected number of gatekeepers is not running.
- Refactored the `gatekeeper-manager` shell script for improved state management and signal handling:
  - Introduced `STARTED`, `RESTARTING`, `TERMINATING` state flags for more controlled operations.
  - Enhanced SIGTERM and SIGHUP handling to gracefully manage gatekeeper lifecycles.
- Added `apk add curl` to ensure `curl` is available in the manager container.
- Renamed the gatekeeper Docker Compose template from `docker-compose_gatekeeper.template.yml` to `gatekeepers.template.yml` and its output to `gatekeepers.yml`.
- Updated `dockergen-gatekeeper` to watch the new template file and notify the correct `gatekeeper-manager` service instance (e.g., `pkmntrade-club-gatekeeper-manager-1`).
- Labeled the generated gatekeeper containers with `gatekeeper=true` so the manager's health check can discover them. (Services that should be protected are still marked with the existing `enable_gatekeeper=true` label.)

**Host Header Management & `ALLOWED_HOSTS` Simplification:**
- HAProxy configuration (`haproxy.cfg`) now consistently sets the `Host` HTTP header for requests to all backend services (e.g., `pkmntrade.club`, `staging.pkmntrade.club`). This centralizes and standardizes host information.
- Consequently, explicit `ALLOWED_HOSTS` environment variables have been removed from the `web` and `celery` service definitions in `docker-compose_web.yml` and `docker-compose_staging.yml`. Backend Django applications should now rely on the `Host` header set by HAProxy for request validation.
- The `gatekeepers.template.yml` now defines a `TARGET_HOST` environment variable for proxied services (e.g., `web`, `web-staging`). This aligns with the ALLOWED_HOSTS on the target to ensure requests aren't blocked.

**Gatus Monitoring & Configuration Updates:**
- In Gatus configuration (`gatus/config.template.yaml`):
  - The "Redis" external service endpoint has been renamed to "Cache" for better clarity and to fit the theme of simple names.
  - A new external service endpoint "Gatekeeper" has been added to monitor the overall health reported by the `gatekeeper-manager`.
  - Health checks for "Web Worker" endpoints (both main and staging) now include the appropriate `Host` header (e.g., `Host: pkmntrade.club`) to ensure accurate health assessments by Django.
- In `docker-compose_core.yml`, the `curl` commands used by `db-redis-healthcheck` for database and cache health now append `|| true`. This prevents the script from exiting on a curl error (e.g., timeout, connection refused), ensuring that the failure is still reported to Gatus via the `success=false` parameter rather than the script terminating prematurely.

These changes collectively make the gatekeeper system more fault-tolerant, provide better visibility into its status, and streamline the configuration of backend applications by standardizing how they receive host information.
---
Review notes (not part of the commit message; text between the "---"
separator and the first "diff --git" line is not applied):
- Gatus report URLs are now quoted or sent with `--get --data-urlencode`.
  Unquoted, the `&` in the query string is a shell background operator, so
  curl ran in the background with only `success=` and `|| true` guarded a
  variable assignment instead of curl. Messages with spaces are now encoded.
- The cache report appends `|| true`, matching the database report.
- The per-container /metrics probe has connect/total timeouts and compares
  the status code as a string, so a hung gatekeeper cannot stall the loop.
- All command substitutions in the health check use the `$$` Compose escape.
- Restored the `-` markers on the removed ALLOWED_HOSTS lines.
Code fixes edit `+` lines in place; hunk line counts are unchanged.

 server/docker-compose_core.yml                | 111 +++++++++++++++---
 server/docker-compose_staging.yml             |   3 -
 server/docker-compose_web.yml                 |   2 -
 ....template.yml => gatekeepers.template.yml} |   8 ++
 server/gatus/config.template.yaml             |  19 ++-
 server/haproxy.cfg                            |   6 +
 6 files changed, 123 insertions(+), 26 deletions(-)
 rename server/{docker-compose_gatekeeper.template.yml => gatekeepers.template.yml} (78%)

diff --git a/server/docker-compose_core.yml b/server/docker-compose_core.yml
index 9925cdf..b07e5de 100644
--- a/server/docker-compose_core.yml
+++ b/server/docker-compose_core.yml
@@ -21,7 +21,7 @@ services:
           --connect-timeout 10 \
           --max-time 15 \
           --header "Authorization: Bearer ${GATUS_TOKEN}" \
-          http://health:8080/api/v1/endpoints/services_database/external?success=$$pg_success&error=$$pg_error;
+          --get --data-urlencode "success=$$pg_success" --data-urlencode "error=$$pg_error" http://health:8080/api/v1/endpoints/services_database/external || true;
         if [ "$$pg_success" = "true" ]; then
           echo " Database is OK";
         else
@@ -42,7 +42,7 @@ services:
           --connect-timeout 10 \
           --max-time 15 \
           --header "Authorization: Bearer ${GATUS_TOKEN}" \
-          http://health:8080/api/v1/endpoints/services_redis/external?success=$$redis_success&error=$$redis_error;
+          --get --data-urlencode "success=$$redis_success" --data-urlencode "error=$$redis_error" http://health:8080/api/v1/endpoints/services_cache/external || true;
         if [ "$$redis_success" = "true" ]; then
           echo " Redis is OK";
         else
@@ -100,7 +100,7 @@ services:
       - ./gatus:/gatus
   dockergen-gatekeeper:
     image: nginxproxy/docker-gen:latest
-    command: -wait 15s -watch /gatekeeper/docker-compose_gatekeeper.template.yml /gatekeeper/docker-compose_gatekeeper.yml -notify-sighup gatekeeper-manager
+    command: -wait 15s -watch /gatekeeper/gatekeepers.template.yml /gatekeeper/gatekeepers.yml -notify-sighup pkmntrade-club-gatekeeper-manager-1
     restart: unless-stopped
     volumes:
       - /var/run/docker.sock:/tmp/docker.sock:ro
@@ -118,39 +118,107 @@ services:
     command:
       - |
         set -eu -o pipefail
+        apk add --no-cache curl

-        COMPOSE_FILE_PATH="/srv/pkmntrade-club/docker-compose_gatekeeper.yml"
+        COMPOSE_FILE_PATH="/srv/pkmntrade-club/gatekeepers.yml"
         PROJECT_DIR_PATH="/srv/pkmntrade-club"
         PROJECT_NAME_TAG="gatekeepers"
+        TERMINATING="false"
+        RESTARTING="false"
+        STARTED="false"

         gatekeeper_down() {
-          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Taking gatekeepers down (Project: $$PROJECT_NAME_TAG)..."
+          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Downing gatekeepers (Project: $$PROJECT_NAME_TAG)..."
           cd "$$PROJECT_DIR_PATH"
           if ! docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" down; then
             echo "$(date +'%Y-%m-%d %H:%M:%S') [WARN]: 'docker compose down' for $$PROJECT_NAME_TAG encountered an issue, but proceeding."
+          else
+            STARTED="false"
           fi
         }

         gatekeeper_up() {
-          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Bringing gatekeepers up/updating (Project: $$PROJECT_NAME_TAG, File: $$COMPOSE_FILE_PATH)..."
+          if [ "$$TERMINATING" = "true" ]; then return; fi
+          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Upping gatekeepers (Project: $$PROJECT_NAME_TAG, File: $$COMPOSE_FILE_PATH)..."
           cd "$$PROJECT_DIR_PATH"
           if ! docker compose -p "$$PROJECT_NAME_TAG" -f "$$COMPOSE_FILE_PATH" up -d --remove-orphans; then
             echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR]: 'docker compose up' for $$PROJECT_NAME_TAG failed. Will retry."
+          else
+            STARTED="true"
+          fi
+        }
+
+        restart_gatekeepers() {
+          if [ "$$TERMINATING" = "true" -o "$$RESTARTING" = "true" -o "$$STARTED" = "false" ]; then return; fi
+          RESTARTING="true"
+          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Restarting gatekeepers."
+          gatekeeper_down
+          gatekeeper_up
+          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Gatekeepers restarted."
+          RESTARTING="false"
+        }
+
+        gatekeeper_healthcheck() {
+          if [ "$$TERMINATING" = "true" -o "$$RESTARTING" = "true" -o "$$STARTED" = "false" ]; then
+            echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Gatekeeper Manager is terminating/restarting/not started. Skipping healthcheck."
+            return 0
+          fi
+          ERROR_MSG=""
+          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Checking gatekeepers health..."
+
+          num_containers=$$(docker ps -q -a --filter "label=gatekeeper" | wc -l)
+
+          if [ "$$num_containers" -eq 0 ]; then
+            ERROR_MSG="No gatekeepers found. Healthcheck failed."
+          elif [ $$(docker ps -q -a --filter "label=gatekeeper" --filter "status=running" | wc -l) -ne "$$num_containers" ]; then
+            ERROR_MSG="Gatekeeper containers are missing or not running. Healthcheck failed."
+          else
+            # probe each gatekeeper's /metrics with a bounded wait; anything but HTTP 200 (curl prints 000 on timeout/refused) marks it unhealthy
+            for container in $$(docker ps -q -a --filter "label=gatekeeper"); do
+              if [ "$$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 10 --max-time 15 -H "X-Real-Ip: 127.0.0.1" "http://$$container:9090/metrics")" != "200" ]; then
+                container_name=$$(docker ps -a --filter "label=gatekeeper" --filter "id=$$container" --format "{{.Names}}")
+                ERROR_MSG="Gatekeeper container $$container_name is unhealthy. Healthcheck failed."
+              fi
+            done
+          fi
+
+          if [ "$$ERROR_MSG" != "" ]; then
+            echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR]: $$ERROR_MSG"
+            curl -s -f -X POST --get \
+              --connect-timeout 10 \
+              --max-time 15 \
+              --header "Authorization: Bearer ${GATUS_TOKEN}" \
+              --data-urlencode "success=false" --data-urlencode "error=$$ERROR_MSG" "http://health:8080/api/v1/endpoints/services_gatekeeper/external" || true
+            restart_gatekeepers
+            return 1
+          else
+            echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: All gatekeepers are OK/HEALTHY."
+            curl -s -f -X POST \
+              --connect-timeout 10 \
+              --max-time 15 \
+              --header "Authorization: Bearer ${GATUS_TOKEN}" \
+              "http://health:8080/api/v1/endpoints/services_gatekeeper/external?success=true&error=HEALTHY" || true
           fi
         }

         handle_sigterm() {
+          if [ "$$TERMINATING" = "true" ]; then return; fi
+          TERMINATING="true"
           echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: SIGTERM received. Initiating graceful shutdown for gatekeepers."
+          curl -s -f -X POST \
+            --connect-timeout 10 \
+            --max-time 15 \
+            --header "Authorization: Bearer ${GATUS_TOKEN}" \
+            "http://health:8080/api/v1/endpoints/services_gatekeeper/external?success=false&error=SIGTERM%20received.%20Shutting%20down." || true
           gatekeeper_down
           echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Gatekeepers shut down. Gatekeeper Manager exiting."
           exit 0
         }

         handle_sighup() {
-          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: SIGHUP received. Restarting gatekeepers."
-          gatekeeper_down
-          gatekeeper_up
-          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Gatekeepers restarted following SIGHUP."
+          if [ "$$TERMINATING" = "true" -o "$$RESTARTING" = "true" -o "$$STARTED" = "false" ]; then return; fi
+          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: SIGHUP received."
+          restart_gatekeepers
         }

         trap 'handle_sigterm' SIGTERM
@@ -158,17 +226,28 @@ services:

         echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Gatekeeper Manager started."

-        echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Periodic refresh enabled: $$REFRESH_INTERVAL seconds."
-        while true; do
-          gatekeeper_up
+        echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Periodic refresh enabled: $$REFRESH_INTERVAL seconds. Initial wait started."

-          # 'sleep 60 &' and 'wait $!' allows signals to interrupt the sleep.
-          sleep $$REFRESH_INTERVAL &
+        while [ "$$TERMINATING" = "false" ]; do
+          # 'sleep x &' and 'wait $!' allows signals to interrupt the sleep.
           # '|| true' ensures the loop continues if 'wait' is killed by a handled signal (SIGHUP/SIGTERM)
           # SIGTERM handler exits completely, so loop won't continue. SIGHUP handler doesn't exit.
+
+          sleep $$REFRESH_INTERVAL &
           wait $! || true

-          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Periodic refresh triggered."
+          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO]: Periodic healthcheck and refresh triggered."
+
+          if [ ! -f "$$COMPOSE_FILE_PATH" ]; then
+            echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR]: Gatekeepers.yml has not been generated after $$REFRESH_INTERVAL seconds. Please check dockergen-gatekeeper is running correctly. Exiting."
+            exit 1
+          fi
+
+          if gatekeeper_healthcheck && [ "$$RESTARTING" = "false" ]; then
+            gatekeeper_up
+          fi
+
+
         done
   health:
     image: twinproduction/gatus:latest
diff --git a/server/docker-compose_staging.yml b/server/docker-compose_staging.yml
index 1b487bb..30cf1f4 100644
--- a/server/docker-compose_staging.yml
+++ b/server/docker-compose_staging.yml
@@ -3,7 +3,6 @@ x-common: &common
   restart: always
   env_file:
     - .env
-
 services:
   web-staging:
     <<: *common
@@ -11,7 +10,6 @@ services:
       - DEBUG=False
       - DISABLE_SIGNUPS=True
       - PUBLIC_HOST=staging.pkmntrade.club
-      - ALLOWED_HOSTS=staging.pkmntrade.club,127.0.0.1,pkmntrade-club-web-staging-1,pkmntrade-club-web-staging-2
     labels:
       - "enable_gatekeeper=true"
     deploy:
@@ -29,5 +27,4 @@ services:
       - DEBUG=False
       - DISABLE_SIGNUPS=True
       - PUBLIC_HOST=staging.pkmntrade.club
-      - ALLOWED_HOSTS=staging.pkmntrade.club,127.0.0.1,pkmntrade-club-celery-staging-1
     command: ["celery", "-A", "pkmntrade_club.django_project", "worker", "-l", "INFO", "-B", "-E"]
\ No newline at end of file
diff --git a/server/docker-compose_web.yml b/server/docker-compose_web.yml
index 51605e8..8ee6cac 100644
--- a/server/docker-compose_web.yml
+++ b/server/docker-compose_web.yml
@@ -14,7 +14,6 @@ services:
       - DEBUG=False
       - DISABLE_SIGNUPS=True
       - PUBLIC_HOST=pkmntrade.club
-      - ALLOWED_HOSTS=pkmntrade.club,127.0.0.1,pkmntrade-club-web-1,pkmntrade-club-web-2,pkmntrade-club-web-3,pkmntrade-club-web-4
     labels:
       - "enable_gatekeeper=true"
     deploy:
@@ -33,5 +32,4 @@ services:
   #     - DEBUG=False
   #     - DISABLE_SIGNUPS=True
   #     - PUBLIC_HOST=pkmntrade.club
-  #     - ALLOWED_HOSTS=pkmntrade.club,127.0.0.1,pkmntrade-club-celery-1,pkmntrade-club-celery-2
   #   command: ["celery", "-A", "pkmntrade_club.django_project", "worker", "-l", "INFO", "-B", "-E"]
\ No newline at end of file
diff --git a/server/docker-compose_gatekeeper.template.yml b/server/gatekeepers.template.yml
similarity index 78%
rename from server/docker-compose_gatekeeper.template.yml
rename to server/gatekeepers.template.yml
index 29a2420..701fbf1 100644
--- a/server/docker-compose_gatekeeper.template.yml
+++ b/server/gatekeepers.template.yml
@@ -25,6 +25,14 @@ services:
       - .env
     environment:
       - TARGET=http://{{ $container.Name }}{{ $port }}
+      {{ if eq $serviceLabel "web" }}
+      - TARGET_HOST=pkmntrade.club # pass this host to django, which checks it with ALLOWED_HOSTS
+      {{ end }}
+      {{ if eq $serviceLabel "web-staging" }}
+      - TARGET_HOST=staging.pkmntrade.club # pass this host to django, which checks it with ALLOWED_HOSTS
+      {{ end }}
+    labels:
+      - gatekeeper=true
     networks:
       default:
         aliases:
diff --git a/server/gatus/config.template.yaml b/server/gatus/config.template.yaml
index 0e3c7e9..33351f4 100644
--- a/server/gatus/config.template.yaml
+++ b/server/gatus/config.template.yaml
@@ -13,7 +13,12 @@ external-endpoints:
     token: "${GATUS_TOKEN}"
     alerts:
       - type: email
-  - name: Redis
+  - name: Cache
+    group: Services
+    token: "${GATUS_TOKEN}"
+    alerts:
+      - type: email
+  - name: Gatekeeper
     group: Services
     token: "${GATUS_TOKEN}"
     alerts:
@@ -68,8 +73,8 @@ endpoints:
       - "[DNS_RCODE] == NOERROR"
     alerts:
       - type: email
-  - name: HAProxy
-    group: Load Balancer
+  - name: Load Balancer
+    group: Services
     url: "http://loba/"
     interval: 60s
     conditions:
@@ -78,8 +83,8 @@ endpoints:
     alerts:
       - type: email
   - name: Feedback
-    group: Services
-    url: "http://feedback:3000/"
+    group: Main
+    url: "http://pkmntrade-club-feedback-1:3000/"
     interval: 60s
     conditions:
       - "[STATUS] == 200"
@@ -107,6 +112,8 @@ endpoints:
   - name: "Web Worker {{ $containerNumber }}"
     group: Main
     url: "http://{{ $container.Name }}:8000/health/"
+    headers:
+      Host: "pkmntrade.club"
     interval: 60s
     conditions:
       - "[STATUS] == 200"
@@ -120,6 +127,8 @@ endpoints:
   - name: "Web Worker {{ $containerNumber }}"
     group: Staging
     url: "http://{{ $container.Name }}:8000/health/"
+    headers:
+      Host: "staging.pkmntrade.club"
     interval: 60s
     conditions:
       - "[STATUS] == 200"
diff --git a/server/haproxy.cfg b/server/haproxy.cfg
index a06f079..14db5f6 100644
--- a/server/haproxy.cfg
+++ b/server/haproxy.cfg
@@ -34,16 +34,22 @@ backend basic_check

 backend pkmntrade.club
     balance leastconn
+    http-request set-header Host pkmntrade.club
     server-template gatekeeper-web- 4 gatekeeper-web:8000 check resolvers docker_resolver init-addr libc,none

 backend staging.pkmntrade.club
     balance leastconn
+    http-request set-header Host staging.pkmntrade.club
     server-template gatekeeper-web-staging- 4 gatekeeper-web-staging:8000 check resolvers docker_resolver init-addr libc,none

 backend feedback.pkmntrade.club
+    balance leastconn
+    http-request set-header Host feedback.pkmntrade.club
     server-template gatekeeper-feedback- 4 gatekeeper-feedback:8000 check resolvers docker_resolver init-addr libc,none

 backend health.pkmntrade.club
+    balance leastconn
+    http-request set-header Host health.pkmntrade.club
     server-template gatekeeper-health- 4 gatekeeper-health:8000 check resolvers docker_resolver init-addr libc,none

 #EOF - trailing newline required