Files
ansible/check_raid.yml
martin.fencl 1c6fb9c9c3 redo
2025-12-23 22:57:47 +01:00

118 lines
4.2 KiB
YAML

---
# check_raid.yml
# Play header and variables for the MD RAID health check.
# Connection details and tuning knobs are read from environment variables
# (set by Semaphore); the tasks below SSH into the VM with those values.
- name: Check Linux MD RAID health on VM via Proxmox
  hosts: linux_servers
  gather_facts: false
  become: true
  become_user: root
  become_method: sudo
  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip: "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    # When true, the remote command is prefixed with "sudo ".
    use_sudo: false

    # Debug / retries
    # The env lookup returns an empty string (not "undefined") for an unset
    # variable, so default() needs its second argument (true) to apply the
    # fallback; without it RETRIES silently became 0 instead of 10.
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    RETRIES: "{{ lookup('env', 'RETRIES') | default(10, true) | int }}"

    # RAID device to check (e.g. md0, md1...)
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"

    # If 0 => fail when resync/recovery/reshape/check/repair is detected in /proc/mdstat
    # If 1 => allow sync operations (a degraded array such as [U_] or [_U] still fails)
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"

    # SSH options
    ssh_connect_timeout: 15

  tasks:
# The SSH task that follows wraps ssh in sshpass, so the binary has to be
# present on the host that runs it.
- name: Ensure sshpass is installed (for password-based SSH)
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: sshpass
# Fetch /proc/mdstat from the VM over password-based SSH and keep the result
# in mdstat_raw. The password is handed to sshpass through the SSHPASS
# environment variable (sshpass -e), not on the command line.
- name: Read /proc/mdstat from VM (via SSH)
  ansible.builtin.command:
    argv:
      - sshpass
      - "-e"
      - ssh
      - "-o"
      # NOTE(review): host key checking is disabled here; confirm this is
      # acceptable for the network the VM lives on.
      - StrictHostKeyChecking=no
      - "-o"
      - "ConnectTimeout={{ ssh_connect_timeout }}"
      - "{{ vm_user }}@{{ vm_ip }}"
      # ssh joins all trailing arguments with spaces before handing them to the
      # remote shell. Passing "bash", "-lc", "cat /proc/mdstat" as separate
      # argv items therefore ran `bash -lc cat /proc/mdstat` remotely, i.e. a
      # bare `cat` with $0=/proc/mdstat. Send one pre-quoted command instead.
      - "bash -lc {{ (('sudo ' if use_sudo else '') + 'cat /proc/mdstat') | quote }}"
  environment:
    SSHPASS: "{{ vm_pass }}"
  register: mdstat_raw
  changed_when: false
  # DEBUG is produced by a template and may arrive as the string "0"; cast it
  # so the comparison is numeric (same pattern as raid_allow_sync | int).
  no_log: "{{ (DEBUG | int) == 0 }}"
  retries: "{{ RETRIES | int }}"
  delay: 2
  until: mdstat_raw.rc == 0
# Print the captured /proc/mdstat text, only when debugging is switched on.
- name: Debug | Show raw /proc/mdstat
  ansible.builtin.debug:
    msg: "{{ mdstat_raw.stdout }}"
  # DEBUG comes from a template and may be the string "1"; cast before
  # comparing so the condition does not depend on the templating mode.
  when: (DEBUG | int) == 1
# Pull the member-status token (e.g. "UU" or "U_") for raid_md_device out of
# the mdstat text. The pattern spans two lines: the "<dev> : active ..." line
# and the following "<n> blocks ... [x/y] [UU]" line.
- name: Extract RAID status token for selected MD device
  ansible.builtin.set_fact:
    # regex_search with a group argument returns a LIST of captured groups
    # (or none when nothing matches). Unwrap it to a plain string so later
    # checks such as `'_' in raid_token` test characters, not list members:
    #   - default([], true) turns the no-match none into an empty list
    #   - first picks the single captured group
    #   - default('') covers the empty-list case
    raid_token: >-
      {{
        mdstat_raw.stdout
        | regex_search(
            raid_md_device ~ '\\s*:\\s*active.*\\n\\s*\\d+\\s+blocks.*\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]',
            '\\1',
            multiline=True
          )
        | default([], true)
        | first
        | default('')
      }}
# Set raid_syncing when any sync keyword occurs anywhere in the captured
# mdstat text (the search runs over the whole output, not one device stanza).
- name: Detect sync operations in mdstat (resync/recovery/reshape/check/repair)
  ansible.builtin.set_fact:
    raid_syncing: >-
      {{
        (mdstat_raw.stdout is search('resync|recovery|reshape|check|repair')) | bool
      }}
# raid_degraded is true only for a non-empty token that contains "_".
- name: Compute degraded flag (underscore means missing member)
  ansible.builtin.set_fact:
    raid_degraded: >-
      {{ (raid_token | length > 0) and ('_' in raid_token) }}
# Stop the play when no status token was extracted for the requested device.
- name: Fail if MD device not found in /proc/mdstat
  changed_when: false
  ansible.builtin.assert:
    success_msg: >-
      RAID device {{ raid_md_device }} found in /proc/mdstat.
    fail_msg: >-
      RAID device {{ raid_md_device }} was not found in /proc/mdstat on VM ({{ vm_ip }}).
    that:
      - (raid_token | length) > 0
# Stop the play when the degraded flag computed earlier is set.
- name: Fail if RAID is degraded (token contains '_')
  changed_when: false
  ansible.builtin.assert:
    success_msg: >-
      RAID {{ raid_md_device }} is OK: token=[{{ raid_token }}].
    fail_msg: >-
      RAID {{ raid_md_device }} is DEGRADED: token=[{{ raid_token }}] (expected all 'U').
    that:
      - not raid_degraded
# Passes when syncing is permitted (raid_allow_sync == 1) or when no sync
# keyword was detected; fails only for "sync running and not allowed".
- name: Fail if RAID is syncing and syncing is not allowed
  changed_when: false
  ansible.builtin.assert:
    success_msg: >-
      No sync operation detected (or RAID_ALLOW_SYNC=1).
    fail_msg: >-
      RAID {{ raid_md_device }} is running a sync operation (resync/recovery/reshape/check/repair) and RAID_ALLOW_SYNC=0.
    that:
      - not ((raid_allow_sync | int) != 1 and raid_syncing)
# One-line recap of the values computed by the preceding tasks; shown only
# when debugging is switched on.
- name: Print concise summary (debug)
  ansible.builtin.debug:
    msg: >-
      RAID={{ raid_md_device }},
      token=[{{ raid_token }}],
      degraded={{ raid_degraded }},
      syncing={{ raid_syncing }},
      allow_sync={{ raid_allow_sync }}
  # DEBUG comes from a template and may be the string "1"; cast before
  # comparing so the condition does not depend on the templating mode.
  when: (DEBUG | int) == 1