Files
ansible/check_raid.yml
martin.fencl eb9c56bb5e redo
2025-12-23 23:04:13 +01:00

160 lines
4.9 KiB
YAML

---
# check_raid.yml
#
# Checks the health of one Linux MD (software RAID) array inside a VM.
# The play runs on the hosts in `linux_servers`, which open a password-based
# SSH session (via sshpass) to the VM and execute a small Python script that
# parses /proc/mdstat. The play fails when the array is degraded, missing, or
# (optionally) busy with a resync/recovery/reshape/check/repair.
#
# Inputs (environment variables, e.g. provided by Semaphore):
#   VM_IP, VM_USER, VM_PASS  - SSH target and credentials of the VM
#   DEBUG                    - 1 = show command output, 0 = hide it (default 0)
#   RETRIES                  - retry count for the SSH task (default 25)
#   RAID_MD                  - md device name to check (default md0)
#   RAID_ALLOW_SYNC          - 1 = tolerate sync activity, 0 = fail (default 1)
- name: Check Linux MD RAID health on VM via Proxmox
  hosts: linux_servers
  gather_facts: false
  become: true
  become_user: root
  become_method: sudo

  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip: "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    # When true, the remote command is prefixed with "sudo ".
    use_sudo: false

    # --- Debug mode (controlled via Semaphore variable) ---
    # The env lookup yields an empty string for an unset variable, so the
    # second argument `true` is required for default() to take effect.
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"

    # --- RAID specifics ---
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
    # 1 = allow resync/recovery/reshape/check/repair;
    # 0 = fail when such activity is detected
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"

    # Retry policy for the SSH task (delay is in seconds)
    raid_retries: "{{ RETRIES }}"
    raid_delay: 2

    # SSH client options: no host-key verification, 15 s connect timeout,
    # password authentication only, a single password attempt (no prompts).
    # NOTE(review): StrictHostKeyChecking=no together with
    # UserKnownHostsFile=/dev/null disables host-key verification; confirm
    # this is acceptable for the network the VM lives on.
    ssh_opts:
      - "-o"
      - "StrictHostKeyChecking=no"
      - "-o"
      - "UserKnownHostsFile=/dev/null"
      - "-o"
      - "ConnectTimeout=15"
      - "-o"
      - "PreferredAuthentications=password"
      - "-o"
      - "PubkeyAuthentication=no"
      - "-o"
      - "KbdInteractiveAuthentication=no"
      - "-o"
      - "NumberOfPasswordPrompts=1"

    # Commands to run on the target VM (each entry is one shell script).
    # Script exit codes: 0 = healthy, 1 = degraded or disallowed sync activity,
    # 2 = /proc/mdstat unreadable or the device was not found.
    raid_commands:
      - |
        python3 - <<'PY'
        # Parse /proc/mdstat and validate MD RAID state
        import re, sys

        md = "{{ raid_md_device }}"
        allow_sync = int("{{ raid_allow_sync }}")

        try:
            txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
        except Exception as e:
            print(f"ERROR: cannot read /proc/mdstat: {e}")
            sys.exit(2)

        # Find token like [UU] / [U_] for the selected md device
        # Example lines:
        #   md0 : active raid1 sdb1[0] sdc1[1]
        #         11718751232 blocks super 1.2 [2/2] [UU]
        pat = re.compile(
            rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]",
            re.MULTILINE
        )
        m = pat.search(txt)
        if not m:
            print(f"ERROR: {md} not found in /proc/mdstat")
            print(txt.strip())
            sys.exit(2)

        token = m.group(1)

        # Only look at this array's stanza (everything up to the next blank
        # line), so sync activity on another md device is not attributed
        # to the device being checked.
        stanza = re.split(r"\n[ \t]*\n", txt[m.start():], maxsplit=1)[0]
        syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", stanza))
        degraded = "_" in token

        print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")

        if degraded:
            sys.exit(1)
        if syncing and not allow_sync:
            sys.exit(1)
        sys.exit(0)
        PY

  tasks:
    # sshpass supplies the password to ssh non-interactively.
    - name: Ensure sshpass is installed (for password-based SSH)
      ansible.builtin.apt:
        name: sshpass
        state: present
        update_cache: true

    # Runs every entry of raid_commands on the VM. The password is handed to
    # sshpass through the SSHPASS environment variable (-e), and the loop
    # label hides the script text from the task output.
    - name: Run RAID check commands on VM (via SSH)
      ansible.builtin.command:
        # Built by concatenation so ssh_opts may have any number of entries.
        argv: "{{ ssh_base_argv + ssh_opts + [ssh_target, remote_cmd] }}"
      vars:
        ssh_base_argv:
          - sshpass
          - -e
          - ssh
        ssh_target: "{{ vm_user }}@{{ vm_ip }}"
        # ssh joins all remote arguments with spaces before handing them to
        # the remote shell, so the script is shell-quoted and sent as ONE
        # argument; otherwise `bash -lc` would only receive its first word.
        remote_cmd: "bash -lc {{ (('sudo ' if use_sudo else '') + item) | quote }}"
      environment:
        SSHPASS: "{{ vm_pass }}"
      loop: "{{ raid_commands }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"
      register: raid_cmds
      changed_when: false
      failed_when: false  # do not stop early; the assert below decides
      no_log: "{{ DEBUG | int == 0 }}"
      retries: "{{ raid_retries }}"
      delay: "{{ raid_delay }}"
      # NOTE(review): with `failed_when: false` the result is presumably never
      # marked failed, so this condition looks always-true and the retries
      # may never trigger - confirm whether e.g. `raid_cmds.rc != 255`
      # (ssh connection error) was intended.
      until: raid_cmds is succeeded

    # Only shown in debug mode (DEBUG=1): dumps the script, its return code
    # and both output streams for every command.
    - name: Show outputs for each RAID command
      ansible.builtin.debug:
        msg: |
          CMD: {{ item.item }}
          RC: {{ item.rc }}
          STDOUT:
          {{ (item.stdout | default('')).strip() }}
          STDERR:
          {{ (item.stderr | default('')).strip() }}
      loop: "{{ raid_cmds.results }}"
      when: DEBUG | int == 1

    # Final verdict: any non-zero return code fails the play. The message
    # falls back to stderr (and then to 'no output') when stdout is empty,
    # so SSH-level errors are visible too.
    - name: Fail play if RAID check failed
      ansible.builtin.assert:
        that:
          - item.rc == 0
        fail_msg: >-
          RAID check failed on VM:
          {{ item.stdout | default(item.stderr, true) | default('no output', true) }}
        success_msg: "RAID check OK."
      loop: "{{ raid_cmds.results }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"