#cloud-config
# Template source: devops/quadlets/modules/hetzner/cloud-init.yml
# Login account. The runcmd section below uses it for the Podman registry
# login and for the user-level sync service.
users:
- name: fourlights
  # Passwordless sudo for every command.
  sudo: 'ALL=(ALL) NOPASSWD:ALL'
  groups: users,admin,sudo
  shell: /bin/bash
  lock_passwd: false
  ssh_authorized_keys:
  # Substituted by the template renderer.
  - "${ssh_public_key}"
# Distribution packages installed on first boot.
packages:
# Container runtime and the reverse proxy in front of it
- podman
- haproxy
# Interpreter and HTTP client library used by the sync script
- python3
- python3-requests
# Command-line helpers
- curl
- wget
- jq
- socat
- nmap
# Refresh the package index and apply pending upgrades before installing.
package_update: true
package_upgrade: true
write_files:
# Sudo rules for the fourlights account: HAProxy service control plus
# ownership and mode changes on the files under /etc/ssl/haproxy.
- path: /etc/sudoers.d/fourlights-haproxy
  permissions: '0440'
  content: |
    fourlights ALL=(root) NOPASSWD: /bin/systemctl reload haproxy
    fourlights ALL=(root) NOPASSWD: /bin/systemctl restart haproxy
    fourlights ALL=(root) NOPASSWD: /bin/systemctl stop haproxy
    fourlights ALL=(root) NOPASSWD: /bin/systemctl start haproxy
    fourlights ALL=(root) NOPASSWD: /bin/chown -R haproxy\:haproxy /etc/ssl/haproxy/*
    fourlights ALL=(root) NOPASSWD: /bin/chmod 600 /etc/ssl/haproxy/*
# HAProxy main configuration.
# Static skeleton only: per-service backends, ACLs and routing rules are
# added at runtime by the sync script through the Data Plane API.
- path: /etc/haproxy/haproxy.cfg
  content: |
    global
        daemon
        stats socket /var/run/haproxy/admin.sock mode 660 level admin expose-fd listeners
        stats timeout 30s
        user haproxy
        group haproxy
        log stdout local0 info

    defaults
        mode http
        timeout connect 5000ms
        timeout client 50000ms
        timeout server 50000ms
        option httplog
        log global

    # Stats interface
    frontend stats
        bind *:8404
        http-request use-service prometheus-exporter if { path /metrics }
        stats enable
        stats uri /stats
        stats refresh 10s

    # HTTP Frontend
    frontend main
        bind *:80
        # ACL to detect ACME challenge requests
        acl is_acme_challenge path_beg /.well-known/acme-challenge/
        # Route ACME challenges to the acme_challenge backend
        use_backend acme_challenge if is_acme_challenge
        default_backend no_match

    # HTTPS Frontend
    frontend https_main
        bind *:443
        default_backend no_match

    # ACME Challenge Backend
    backend acme_challenge
        mode http
        server acme_server 127.0.0.1:8888

    # Default backend
    backend no_match
        http-request return status 404 content-type text/plain string "No matching service found"
# HAProxy Data Plane API configuration.
# The API can rewrite the whole proxy configuration and is protected only by
# a fixed password, so it listens on loopback. Both clients in this file (the
# wrapper's readiness probe and the sync script) already connect via
# localhost / 127.0.0.1 on port 5555.
# NOTE(review): those clients authenticate with an empty user name and the
# password "admin"; check that pairing before renaming any key under `user`.
# NOTE(review): key nesting was reconstructed from a flattened copy of this
# file; confirm that `stats_socket` belongs directly under `haproxy`.
- path: /etc/dataplaneapi/dataplaneapi.yml
  content: |
    dataplaneapi:
      host: 127.0.0.1
      port: 5555
      user:
      - insecure: true
        password: admin
        username: admin
    haproxy:
      config_file: /etc/haproxy/haproxy.cfg
      haproxy_bin: /usr/sbin/haproxy
      reload:
        reload_cmd: systemctl reload haproxy
        restart_cmd: systemctl restart haproxy
      stats_socket: /var/run/haproxy/admin.sock
# Start-up gate for the sync service: waits for HAProxy and for the Data
# Plane API to answer, then replaces itself with the Python sync daemon.
- path: /usr/local/bin/podman-haproxy-acme-sync-wrapper.sh
  permissions: '0755'
  content: |
    #!/bin/bash
    # Exit status: 1 when either dependency is not ready within MAX_WAIT
    # seconds; otherwise whatever the sync daemon returns.
    set -e

    MAX_WAIT=60
    ELAPSED=0

    # Wait for HAProxy
    echo "Checking HAProxy status..."
    while ! systemctl is-active --quiet haproxy; do
      echo "Waiting for HAProxy to start..."
      sleep 2
      # Arithmetic expansion. A single-parenthesis form would try to run
      # "0 + 2" as a command and abort the script under set -e.
      ELAPSED=$((ELAPSED + 2))
      if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
        echo "ERROR: HAProxy timeout"
        exit 1
      fi
    done
    echo "HAProxy is active"

    # Reset and wait for Data Plane API to actually respond
    ELAPSED=0
    echo "Checking Data Plane API readiness..."
    while true; do
      # The doubled percent sign is the template escape for curl's write-out
      # variable. curl itself prints 000 when the connection fails, so a
      # failure only needs to be tolerated here, not replaced.
      HTTP_CODE=$(curl -s -w "%%{http_code}" -o /dev/null \
        --connect-timeout 5 \
        --max-time 10 \
        -u :admin \
        http://localhost:5555/v3/services/haproxy/configuration/version 2>/dev/null || true)
      if [ "$HTTP_CODE" = "200" ]; then
        echo "Data Plane API ready"
        break
      fi
      echo "Waiting for Data Plane API... (HTTP $HTTP_CODE)"
      sleep 2
      ELAPSED=$((ELAPSED + 2))
      if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
        # Written without a unit suffix glued to the variable name, which
        # would otherwise be read as a different, unset variable.
        echo "ERROR: Data Plane API not ready within $MAX_WAIT seconds (HTTP $HTTP_CODE)"
        journalctl -u dataplaneapi -n 50 --no-pager
        exit 1
      fi
    done

    sleep 2
    exec /usr/local/bin/podman-haproxy-acme-sync.py
# Podman HAProxy ACME Sync Script
# Long-running daemon started by the wrapper script above. It watches Podman
# containers and mirrors their haproxy.* labels into HAProxy through the
# Data Plane API, requesting certificates with acme.sh where TLS is enabled.
- path: /usr/local/bin/podman-haproxy-acme-sync.py
  permissions: '0755'
  content: |
    #!/usr/bin/env python3
    """Keep HAProxy routing in step with the Podman containers on this host.

    Containers opt in through labels: haproxy.<service>.enable=true plus the
    optional .host, .port, .domain and .tls keys, or the older flat form
    (haproxy.enable, haproxy.service, haproxy.host, ...). For every opted-in
    container a backend and server are created through the HAProxy Data Plane
    API, host-based routing is added for the configured domain and, when TLS
    is requested, a certificate is obtained with acme.sh.
    """
    import json
    import os
    import select
    import subprocess
    import sys
    import time

    import requests

    HAPROXY_API_BASE = "http://:admin@127.0.0.1:5555/v3"
    CERT_DIR = "/home/fourlights/.acme.sh"
    ACME_SCRIPT = "/usr/local/bin/acme.sh"

    # Container actions that add a service to, or remove it from, HAProxy.
    START_ACTIONS = ('start', 'restart')
    STOP_ACTIONS = ('stop', 'remove', 'died')

    # Seconds to wait before asking for a certificate again after a failure,
    # so a failing domain is not re-requested on every periodic sync.
    CERT_RETRY_INTERVAL = 900


    def log(message):
        """Print and flush at once so the line is not held back in a buffer."""
        print(message, flush=True)


    class PodmanHAProxyACMESync:
        """Translates container lifecycle events into Data Plane API calls."""

        def __init__(self):
            # Domains whose certificate was issued during this process lifetime.
            self.ssl_services = set()
            # domain -> earliest epoch time for the next issuance attempt.
            self._cert_retry_at = {}
            self.session = requests.Session()
            self.session.headers.update({'Content-Type': 'application/json'})

        # ------------------------------------------------------------------
        # Data Plane API helpers
        # ------------------------------------------------------------------
        def _config_url(self, path, versioned=False):
            """Return the configuration endpoint for path.

            With versioned=True the current configuration version is fetched
            and appended as the ?version= query parameter.
            """
            url = f"{HAPROXY_API_BASE}/services/haproxy/configuration/{path}"
            if versioned:
                url += f"?version={self.get_dataplaneapi_version()}"
            return url

        def get_next_index(self, path):
            """Return the entry count at path (the next free index), or None on error."""
            response = self.session.get(self._config_url(path))
            return len(response.json()) if response.status_code == 200 else None

        def get_dataplaneapi_version(self):
            """Return the current configuration version, or None on error."""
            response = self.session.get(self._config_url("version"))
            return response.json() if response.status_code == 200 else None

        def _append_once(self, path, payload, **identity):
            """Append payload to the indexed list at path, at most once.

            An existing entry whose fields equal every identity keyword counts
            as "already present"; nothing is sent in that case. This keeps the
            periodic full sync from stacking duplicate ACLs and rules.
            Returns True when a POST was sent.
            """
            response = self.session.get(self._config_url(path))
            if response.status_code != 200:
                log(f"Cannot list {path} (HTTP {response.status_code}); not adding {identity}")
                return False
            existing = response.json()
            for entry in existing:
                if isinstance(entry, dict) and all(entry.get(k) == v for k, v in identity.items()):
                    return False
            self.session.post(self._config_url(f"{path}/{len(existing)}", versioned=True),
                              json=payload)
            return True

        # ------------------------------------------------------------------
        # Podman helpers
        # ------------------------------------------------------------------
        def get_container_labels(self, container_id):
            """Return the label dict of a container, or {} when it cannot be inspected."""
            try:
                result = subprocess.run(['podman', 'inspect', container_id],
                                        capture_output=True, text=True)
                if result.returncode == 0:
                    data = json.loads(result.stdout)
                    return data[0]['Config']['Labels'] or {}
            except Exception as e:
                log(f"Error getting labels for {container_id}: {e}")
            return {}

        # ------------------------------------------------------------------
        # Certificates
        # ------------------------------------------------------------------
        def request_certificate(self, domain):
            """Issue a certificate for domain with acme.sh in standalone mode.

            The challenge listener uses port 8888, which the static HAProxy
            configuration forwards ACME challenge paths to. Returns True and
            installs the certificate on success, False otherwise.
            """
            log(f"[CERT-REQUEST] About to request certificate for {domain}")
            try:
                cmd = [
                    ACME_SCRIPT,
                    "--issue",
                    "-d", domain,
                    "--standalone",
                    "--httpport", "8888",
                    "--server", "letsencrypt",
                    "--listen-v4",
                    "--debug", "2"
                ]
                # Log the command being executed
                log(f"[CERT-REQUEST] Executing: {' '.join(cmd)}")
                result = subprocess.run(cmd, capture_output=True, text=True)
                # Log both stdout and stderr for complete debugging
                if result.stdout:
                    log(f"[CERT-STDOUT] {result.stdout}")
                if result.stderr:
                    log(f"[CERT-STDERR] {result.stderr}")
                if result.returncode == 0:
                    log(f"[CERT-SUCCESS] Certificate obtained for {domain}")
                    self.install_certificate(domain)
                    return True
                log(f"[CERT-FAILED] Failed to obtain certificate for {domain}")
                log(f"[CERT-FAILED] Return code: {result.returncode}")
                return False
            except Exception as e:
                log(f"[CERT-ERROR] Error requesting certificate: {e}")
                return False

        def install_certificate(self, domain):
            """Combine the acme.sh output files into one PEM and register the TLS bind."""
            cert_file = f"{CERT_DIR}/{domain}.pem"
            acme_cert_dir = f"{CERT_DIR}/{domain}_ecc"
            try:
                with open(cert_file, 'w') as outfile:
                    for part in ("fullchain.cer", f"{domain}.key"):
                        with open(f"{acme_cert_dir}/{part}") as src:
                            outfile.write(src.read())
                    try:
                        with open(f"{acme_cert_dir}/ca.cer") as ca:
                            outfile.write(ca.read())
                    except FileNotFoundError:
                        pass  # the separate CA file is optional
                os.chmod(cert_file, 0o600)
                log(f"Certificate installed at {cert_file}")
                self.update_haproxy_ssl_bind(domain)
            except Exception as e:
                log(f"Error installing certificate for {domain}: {e}")

        def update_haproxy_ssl_bind(self, domain):
            """Add a TLS bind on port 443 of https_main that serves the domain's PEM."""
            # NOTE(review): the static configuration already declares a plain
            # bind on *:443 for https_main; confirm how HAProxy is expected to
            # treat this additional bind on the same address and port.
            log(f"Updating ssl bind for {domain}")
            try:
                ssl_bind_data = {
                    "address": "*",
                    "port": 443,
                    "ssl": True,
                    "ssl_certificate": f"{CERT_DIR}/{domain}.pem",
                }
                response = self.session.post(
                    self._config_url("frontends/https_main/binds", versioned=True),
                    json=ssl_bind_data)
                log(response.json())
                if response.status_code in [200, 201]:
                    log(f"Updated HAProxy SSL bind for {domain}")
            except Exception as e:
                log(f"Error updating HAProxy SSL bind: {e}")

        def setup_certificate_renewal(self, domain):
            """Write a daily cron entry that renews the certificate for domain.

            A failure to write the file is logged instead of raised, so the
            caller can still record the domain as issued.
            """
            # NOTE(review): this targets /etc/cron.d while the unit that runs
            # this script is installed as a user service; confirm the account
            # is allowed to write there.
            renewal_script = f"/etc/cron.d/acme-{domain.replace('.', '-')}"
            cron_content = (
                f'0 0 * * * root {ACME_SCRIPT} --renew -d {domain} '
                f'--post-hook "systemctl reload haproxy" >/dev/null 2>&1\n'
            )
            try:
                with open(renewal_script, 'w') as f:
                    f.write(cron_content)
            except OSError as e:
                log(f"Could not write renewal job {renewal_script}: {e}")
                return
            log(f"Setup automatic renewal for {domain}")

        # ------------------------------------------------------------------
        # HAProxy configuration
        # ------------------------------------------------------------------
        def update_haproxy_backend(self, service_name, host, port, action='add'):
            """Create the backend and server for a service, or delete its server.

            action='add' posts backend_<service> and <service>_server;
            action='remove' deletes only the server entry.
            """
            backend_name = f"backend_{service_name}"
            server_name = f"{service_name}_server"
            if action == 'add':
                backend_data = {
                    "name": backend_name,
                    "mode": "http",
                    "balance": {"algorithm": "roundrobin"},
                }
                created = self.session.post(self._config_url("backends", versioned=True),
                                            json=backend_data)
                log(created.json())
                server_data = {
                    "name": server_name,
                    "address": host,
                    "port": int(port),
                    "check": "enabled",
                }
                added = self.session.post(
                    self._config_url(f"backends/{backend_name}/servers", versioned=True),
                    json=server_data)
                log(added.json())
            elif action == 'remove':
                self.session.delete(
                    self._config_url(f"backends/{backend_name}/servers/{server_name}",
                                     versioned=True))

        def update_haproxy_frontend_rule(self, service_name, domain, ssl_enabled=False, action='add'):
            """Route requests for domain to the service's backend.

            Adds a host ACL and a backend switching rule to the HTTP frontend
            and, with ssl_enabled, to the HTTPS frontend, plus an HTTP to
            HTTPS redirect. Entries that already exist are left alone. Only
            action='add' does anything.
            """
            if action != 'add':
                return

            needs_cert = ssl_enabled and domain and domain not in self.ssl_services
            if needs_cert and time.time() >= self._cert_retry_at.get(domain, 0):
                log(f"Setting up SSL for {domain}")
                if self.request_certificate(domain):
                    self.setup_certificate_renewal(domain)
                    self.ssl_services.add(domain)
                else:
                    self._cert_retry_at[domain] = time.time() + CERT_RETRY_INTERVAL

            acl_name = f"is_{service_name}"
            backend_name = f"backend_{service_name}"
            frontends = ['main', 'https_main'] if ssl_enabled else ['main']

            acl_data = {
                "acl_name": acl_name,
                "criterion": "hdr(host)",
                "value": domain,
            }
            for frontend in frontends:
                self._append_once(f"frontends/{frontend}/acls", acl_data, acl_name=acl_name)

            rule_data = {
                "name": backend_name,
                "cond": "if",
                "cond_test": acl_name,
            }
            for frontend in frontends:
                self._append_once(f"frontends/{frontend}/backend_switching_rules", rule_data,
                                  name=backend_name, cond_test=acl_name)

            if ssl_enabled:
                # NOTE(review): payload shape kept as it was; verify the field
                # names against the Data Plane API schema for redirect rules.
                redirect_rule = {
                    "type": "redirect",
                    "redirect_rule": {
                        "type": "scheme",
                        "value": "https",
                        "code": 301
                    },
                    "cond": "if",
                    "cond_test": acl_name,
                }
                self._append_once("frontends/main/http_request_rules", redirect_rule,
                                  type="redirect", cond_test=acl_name)

        # ------------------------------------------------------------------
        # Event handling
        # ------------------------------------------------------------------
        @staticmethod
        def _discover_services(labels, container_id):
            """Return {service_name: config} for every service the labels enable."""
            services = {}
            # First, check for namespaced labels (haproxy.{service_name}.enable)
            for label_key, label_value in labels.items():
                if not (label_key.startswith('haproxy.') and label_key.endswith('.enable')):
                    continue
                if label_value.lower() != 'true':
                    continue
                parts = label_key.split('.')
                if len(parts) != 3:  # only haproxy.{service_name}.enable
                    continue
                service_name = parts[1]
                prefix = f'haproxy.{service_name}'
                services[service_name] = {
                    'service_name': service_name,
                    'host': labels.get(f'{prefix}.host', '127.0.0.1'),
                    'port': labels.get(f'{prefix}.port', '8080'),
                    'domain': labels.get(f'{prefix}.domain', None),
                    'ssl_enabled': labels.get(f'{prefix}.tls', 'false').lower() == 'true'
                }
            # Backward compatibility: If no namespaced labels found, check for flat labels
            if not services and labels.get('haproxy.enable', '').lower() == 'true':
                service_name = labels.get('haproxy.service', container_id)
                services[service_name] = {
                    'service_name': service_name,
                    'host': labels.get('haproxy.host', '127.0.0.1'),
                    'port': labels.get('haproxy.port', '8080'),
                    'domain': labels.get('haproxy.domain', None),
                    'ssl_enabled': labels.get('haproxy.tls', 'false').lower() == 'true'
                }
            return services

        def process_container_event(self, event):
            """Apply one container event to HAProxy.

            Two event shapes are accepted: the one printed by the Podman CLI
            (top-level ID and Status) and the Docker-style one with Actor.ID
            and Action, which the periodic full sync also builds.
            """
            actor = event.get('Actor')
            if not isinstance(actor, dict):
                actor = {}
            full_id = actor.get('ID') or event.get('ID')
            action = event.get('Action') or event.get('Status')

            log(f"[EVENT-DEBUG] Received event - Type: {event.get('Type', 'MISSING')}, Action: {action or 'MISSING'}")
            if not full_id:
                log(f"[EVENT-SKIP] Skipping event without a container ID - Full event: {json.dumps(event)}")
                return
            # Ignore lifecycle noise before paying for a podman inspect call.
            if action not in START_ACTIONS and action not in STOP_ACTIONS:
                return

            container_id = full_id[:12]
            log(f"[EVENT-PROCESS] Processing '{action}' event for container {container_id}")
            labels = self.get_container_labels(container_id)
            services = self._discover_services(labels, container_id)

            # Process each discovered service
            for config in services.values():
                if action in START_ACTIONS:
                    log(f"Adding service {config['service_name']} to HAProxy (SSL: {config['ssl_enabled']}, Domain: {config['domain']})")
                    self.update_haproxy_backend(config['service_name'], config['host'], config['port'], 'add')
                    if config['domain']:
                        self.update_haproxy_frontend_rule(config['service_name'], config['domain'], config['ssl_enabled'], 'add')
                else:
                    log(f"Removing service {config['service_name']} from HAProxy")
                    self.update_haproxy_backend(config['service_name'], config['host'], config['port'], 'remove')

        def _handle_event_line(self, line):
            """Decode one line of event output and dispatch container events."""
            line = line.strip()
            if not line:
                return
            event = None
            try:
                event = json.loads(line)
                if event['Type'] == 'container':
                    self.process_container_event(event)
            except json.JSONDecodeError as e:
                log(f"[EVENT-ERROR] JSON decode error: {e} - Line: {line[:100]}")
            except KeyError as e:
                log(f"[EVENT-ERROR] Missing key {e} in event: {json.dumps(event)}")
            except Exception as e:
                log(f"[EVENT-ERROR] Error processing event: {e}")
                log(f"[EVENT-ERROR] Event structure: {json.dumps(event)}")

        def watch_events(self):
            """Run forever: full sync at start and every 60 seconds, live events in between."""
            log("Starting Podman-HAProxy-ACME sync...")
            SYNC_INTERVAL = 60  # Re-scan all containers every 60 seconds

            def do_full_sync():
                """Perform a full sync of all running containers"""
                log("Performing full container sync...")
                try:
                    result = subprocess.run(['podman', 'ps', '--format', 'json'],
                                            capture_output=True, text=True)
                    if result.returncode != 0:
                        return
                    containers = json.loads(result.stdout)
                except Exception as e:
                    log(f"Error during full sync: {e}")
                    return
                for container in containers:
                    event = {
                        'Type': 'container',
                        'Action': 'start',
                        'Actor': {'ID': container.get('Id', '')}
                    }
                    # One broken container must not stop the others from syncing.
                    try:
                        self.process_container_event(event)
                    except Exception as e:
                        log(f"Error during full sync: {e}")
                log(f"Synced {len(containers)} containers")

            def spawn_event_stream():
                # Unbuffered binary pipe: select() then reflects exactly what
                # is readable, with nothing hidden in a Python-side buffer.
                return subprocess.Popen(['podman', 'events', '--format', 'json'],
                                        stdout=subprocess.PIPE, bufsize=0)

            # Initial sync
            do_full_sync()
            last_full_sync = time.time()

            log("Watching for container events...")
            process = spawn_event_stream()
            pending = b""
            while True:
                # Check if it's time for periodic sync
                if time.time() - last_full_sync >= SYNC_INTERVAL:
                    do_full_sync()
                    last_full_sync = time.time()

                # Check for events with timeout
                ready, _, _ = select.select([process.stdout], [], [], 5)
                if not ready:
                    continue

                chunk = os.read(process.stdout.fileno(), 65536)
                if not chunk:
                    # End of stream: the events process went away. Without this
                    # branch select() would report "ready" forever and spin.
                    log("[EVENT-ERROR] podman events stream ended; restarting it")
                    process.stdout.close()
                    if process.poll() is None:
                        process.kill()
                    process.wait()
                    time.sleep(5)
                    process = spawn_event_stream()
                    pending = b""
                    continue

                # Keep any trailing partial line for the next read.
                pending += chunk
                *complete, pending = pending.split(b"\n")
                for raw in complete:
                    self._handle_event_line(raw.decode('utf-8', errors='replace'))


    if __name__ == "__main__":
        os.makedirs(CERT_DIR, exist_ok=True)
        sync = PodmanHAProxyACMESync()
        sync.watch_events()
# First-boot provisioning commands, executed in order.
runcmd:
# Create necessary directories
- mkdir -p /var/run/haproxy /etc/ssl/haproxy /etc/containers/systemd /etc/haproxy/dataplane /etc/dataplaneapi
- chown haproxy:haproxy /var/run/haproxy
# Install Data Plane API
# -f makes curl fail on an HTTP error instead of saving the error page as
# the package file.
- cd /tmp && curl -fLO https://github.com/haproxytech/dataplaneapi/releases/download/v3.2.4/dataplaneapi_3.2.4_linux_amd64.deb
- env DEBIAN_FRONTEND=noninteractive apt install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" /tmp/dataplaneapi_3.2.4_linux_amd64.deb
- rm /tmp/dataplaneapi_3.2.4_linux_amd64.deb
# User-level systemd unit that runs the sync wrapper for fourlights.
# NOTE(review): the unit hard-codes /run/user/1000; confirm fourlights is
# created with UID 1000 on the target image.
- mkdir -p /home/fourlights/.config/containers/systemd
- mkdir -p /home/fourlights/.config/systemd/user
- |
  cat > /home/fourlights/.config/systemd/user/podman-haproxy-acme-sync.service << 'EOF'
  [Unit]
  Description=Podman HAProxy ACME Sync Service
  After=network.target
  [Service]
  Type=simple
  Environment="XDG_RUNTIME_DIR=/run/user/1000"
  Environment="DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus"
  ExecStart=/usr/local/bin/podman-haproxy-acme-sync-wrapper.sh
  StandardOutput=journal
  StandardError=journal
  Restart=always
  RestartSec=10
  [Install]
  WantedBy=default.target
  EOF
- chown -R fourlights:fourlights /home/fourlights
# Install ACME.sh
- su - fourlights -c 'curl https://get.acme.sh | sh -s email=${acme_email}'
- ln -sf /home/fourlights/.acme.sh/acme.sh /usr/local/bin/acme.sh
# Setup data directory and mount volume
- mkdir -p /opt/storage/data
# Format only when blkid finds no existing filesystem, so a volume that is
# re-attached to a rebuilt server keeps its data.
- blkid /dev/sdb >/dev/null 2>&1 || mkfs.ext4 -F /dev/sdb
- mount /dev/sdb /opt/storage/data
- echo '/dev/sdb /opt/storage/data ext4 defaults 0 2' >> /etc/fstab
- chown -R fourlights:fourlights /opt/storage/data
# Enable Podman for user services
- loginctl enable-linger fourlights
- su - fourlights -c 'podman login ghcr.io -u ${ghcr_username} -p ${ghcr_token}'
# Enable and start services
- systemctl daemon-reload
- systemctl enable --now haproxy
- systemctl enable --now dataplaneapi
- su - fourlights -c 'systemctl --user daemon-reload'
- su - fourlights -c 'systemctl --user enable --now podman-haproxy-acme-sync'
final_message: "Server setup complete with HAProxy, Podman, and ACME sync configured"