# File viewer metadata (scrape residue): 146 lines, 4.5 KiB, YAML
---
# check_raid.yml
#
# Checks the health of one Linux MD (software RAID) array on a VM.
#
# Every host in `linux_servers` installs sshpass, opens a password-based SSH
# session to the VM named by the VM_IP / VM_USER / VM_PASS environment
# variables, and runs a small Python script there that parses /proc/mdstat.
# The play fails when the array is degraded, or when it is syncing and
# RAID_ALLOW_SYNC is 0.
# NOTE(review): the play name says "via Proxmox"; presumably `linux_servers`
# are the Proxmox nodes used as jump hosts - confirm against the inventory.
- name: Check Linux MD RAID health on VM via Proxmox
  hosts: linux_servers
  gather_facts: false
  become: true
  become_user: root
  become_method: sudo

  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip: "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    # When true, the remote command is prefixed with `sudo `.
    use_sudo: false

    # --- Debug mode (controlled via Semaphore variable) ---
    # The env lookup yields an empty string (not "undefined") for an unset
    # variable, so default() needs its second argument set to true to apply.
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"

    # --- RAID specifics ---
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
    # 1 = allow resync/recovery/reshape/check/repair;
    # 0 = fail when such activity is detected
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"

    # Retry policy
    raid_retries: "{{ RETRIES }}"
    raid_delay: 2

    # Hard timeout for the whole SSH command (prevents hanging forever)
    ssh_hard_timeout: 30

    # SSH options: password authentication only, a single prompt, so that a
    # wrong or missing password fails instead of waiting for input.
    # NOTE(review): StrictHostKeyChecking=no together with
    # UserKnownHostsFile=/dev/null turns off host key verification while a
    # password is being sent - confirm this is acceptable for this network.
    ssh_opts:
      - "-o"
      - "StrictHostKeyChecking=no"
      - "-o"
      - "UserKnownHostsFile=/dev/null"
      - "-o"
      - "ConnectTimeout=15"
      - "-o"
      - "PreferredAuthentications=password"
      - "-o"
      - "PubkeyAuthentication=no"
      - "-o"
      - "KbdInteractiveAuthentication=no"
      - "-o"
      - "NumberOfPasswordPrompts=1"

    # Commands to run on the target VM
    raid_commands:
      - |
        python3 - <<'PY'
        # Parse /proc/mdstat and validate the state of one MD RAID array.
        # Exit codes: 0 = healthy, 1 = degraded or disallowed sync activity,
        # 2 = /proc/mdstat unreadable or the array stanza was not matched.
        import re
        import sys

        md = "{{ raid_md_device }}"
        allow_sync = int("{{ raid_allow_sync }}")

        try:
            with open("/proc/mdstat", "r", encoding="utf-8", errors="ignore") as fh:
                txt = fh.read()
        except Exception as e:
            print(f"ERROR: cannot read /proc/mdstat: {e}")
            sys.exit(2)

        # Header line "<md> : active ..." followed by the "<n> blocks ..."
        # line that carries the [n/m] counter and the [UU_] member map.
        pat = re.compile(
            rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]",
            re.MULTILINE,
        )
        m = pat.search(txt)
        if not m:
            print(f"ERROR: {md} not found in /proc/mdstat")
            print(txt.strip())
            sys.exit(2)

        token = m.group(1)
        # Look for sync activity only inside this array's stanza (up to the
        # next blank line), so activity on another array is not attributed
        # to the one being checked.
        stanza = re.split(r"\n\s*\n", txt[m.start():], maxsplit=1)[0]
        syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", stanza))
        # An underscore in the member map marks a missing or failed member.
        degraded = "_" in token

        print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")

        if degraded:
            sys.exit(1)

        if syncing and not allow_sync:
            sys.exit(1)

        sys.exit(0)
        PY

  tasks:
    # Fail early with a readable message instead of retrying SSH against an
    # empty target. Only the expressions are reported, never the values.
    - name: Verify that the VM connection settings were provided
      ansible.builtin.assert:
        that:
          - vm_ip | length > 0
          - vm_user | length > 0
          - vm_pass | length > 0
        fail_msg: "VM_IP, VM_USER and VM_PASS must be set in the environment."
        quiet: true

    - name: Ensure sshpass is installed (for password-based SSH)
      ansible.builtin.apt:
        name: sshpass
        state: present
        update_cache: true

    # The password is handed to sshpass through the SSHPASS environment
    # variable (-e), so it never appears in argv. ssh joins its trailing
    # arguments with spaces before the remote shell parses them, so the
    # script is shell-quoted and sent as one `bash -lc '<script>'` string.
    - name: Run RAID check commands on VM (via SSH)
      ansible.builtin.command:
        argv: >-
          {{
            ['timeout', '-k', '5', (ssh_hard_timeout | string)]
            + ['sshpass', '-e', 'ssh']
            + ssh_opts
            + [ vm_user ~ '@' ~ vm_ip,
                'bash -lc ' ~ ((('sudo ' if (use_sudo | bool) else '') ~ item) | quote)
              ]
          }}
      environment:
        SSHPASS: "{{ vm_pass }}"
      loop: "{{ raid_commands }}"
      loop_control:
        index_var: idx
        # Short label so the full script is not echoed for every item.
        label: "cmd-{{ idx }}"
      register: raid_cmds
      changed_when: false
      failed_when: false  # do not stop early; assert below decides
      # Templated vars can arrive as strings; cast before comparing.
      no_log: "{{ (DEBUG | int) == 0 }}"
      retries: "{{ raid_retries | int }}"
      delay: "{{ raid_delay }}"
      # Retry only on typical SSH/timeout failures (255=ssh error,
      # 124=timeout). With a loop, `until` is evaluated per item and the
      # registered variable is that item's own result (no `.results` list).
      until: (raid_cmds.rc | default(0)) not in [124, 255]

    - name: Show outputs for each RAID command
      ansible.builtin.debug:
        msg: |
          CMD: {{ item.item }}
          RC: {{ item.rc }}
          STDOUT:
          {{ (item.stdout | default('')).strip() }}
          STDERR:
          {{ (item.stderr | default('')).strip() }}
      loop: "{{ raid_cmds.results }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"
      when: (DEBUG | int) == 1

    - name: Fail play if RAID check failed
      ansible.builtin.assert:
        that: "(item.rc | default(-1)) == 0"
        # stdout is an empty string (not undefined) when ssh itself fails,
        # so fall through on emptiness to reach stderr.
        fail_msg: >-
          RAID check failed on VM:
          {{ (item.stdout | default('', true))
             or (item.stderr | default('', true))
             or 'no output' }}
        success_msg: "RAID check OK."
      loop: "{{ raid_cmds.results }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"