Files
ansible/check_raid.yml
martin.fencl 3c7701a760 add check raid
2025-12-23 20:01:45 +01:00

157 lines
6.0 KiB
YAML

---
# check_raid.yml
# Play header and tunables. Connection data and behaviour toggles are read
# from environment variables of the process that runs ansible-playbook.
- name: Check mdadm RAID status on VM (via SSH from Semaphore host)
  hosts: linux_servers
  gather_facts: false
  become: true
  become_user: root
  become_method: sudo
  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip: "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    use_sudo: false
    # NOTE: lookup('env', ...) yields an empty string (not "undefined") for an
    # unset variable, so default() only takes effect with its second argument
    # set to true. Without it, '' | int silently turned every toggle into 0.
    # Debug / behavior toggles (controlled via Semaphore variables)
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    # Which md device to check (md0, md1, ... or /dev/md0)
    RAID_DEVICE: "{{ lookup('env', 'RAID_DEVICE') | default('md0', true) }}"
    # Fail the job if rebuild/resync is in progress (recommended)
    FAIL_ON_RESYNC: "{{ lookup('env', 'FAIL_ON_RESYNC') | default(1, true) | int }}"
    # Fail the job if a periodic "check" is running (usually set 0 to avoid noise)
    FAIL_ON_CHECK: "{{ lookup('env', 'FAIL_ON_CHECK') | default(0, true) | int }}"
  tasks:
# sshpass feeds the SSH password non-interactively; this playbook invokes it
# when reading /proc/mdstat.
- name: Ensure sshpass is installed (for password-based SSH)
  ansible.builtin.apt:
    # Refresh the package index only when it is older than one day (86400 s).
    update_cache: true
    cache_valid_time: 86400
    name: sshpass
    state: present
# Fetch /proc/mdstat from the VM by running ssh through sshpass.
# The password travels in the SSHPASS environment variable (sshpass -e), so it
# is not part of the command line. The result is registered as mdstat_cmd and
# never fails here; the return code is left for a later check.
- name: Read /proc/mdstat from VM (via SSH)
  vars:
    # Script executed by the remote bash; optionally prefixed with sudo.
    remote_mdstat_cmd: "{{ ('sudo ' if use_sudo else '') ~ 'cat /proc/mdstat' }}"
  ansible.builtin.command:
    argv:
      - sshpass
      - -e
      - ssh
      # NOTE(review): disabling host key checking accepts any host key and
      # allows man-in-the-middle attacks; consider a managed known_hosts file.
      - -o
      - StrictHostKeyChecking=no
      - -o
      - ConnectTimeout=15
      - "{{ vm_user }}@{{ vm_ip }}"
      # ssh joins all trailing arguments with spaces and the remote shell
      # splits them again. Passing "bash", "-lc", "cat /proc/mdstat" as three
      # items makes the remote side run `bash -lc cat /proc/mdstat`, i.e. a
      # bare `cat` with /proc/mdstat as $0. Send ONE argument with the script
      # shell-quoted instead.
      - "bash -lc {{ remote_mdstat_cmd | quote }}"
  environment:
    SSHPASS: "{{ vm_pass }}"
  register: mdstat_cmd
  changed_when: false
  failed_when: false
  # Hides stdout/stderr in normal mode; cast DEBUG because a templated value
  # may arrive as a string, in which case `DEBUG == 0` would be false.
  no_log: "{{ (DEBUG | int) == 0 }}"
# Convert a non-zero return code of the registered mdstat_cmd result into a
# task failure that names the VM, the rc and the captured stderr.
- name: Fail if we cannot read /proc/mdstat (SSH/auth/network)
  ansible.builtin.assert:
    success_msg: "Successfully read /proc/mdstat from {{ vm_ip }}."
    fail_msg: >-
      Cannot read /proc/mdstat from {{ vm_ip }} (rc={{ mdstat_cmd.rc }}).
      stderr={{ (mdstat_cmd.stderr | default('') | trim) }}
    that:
      - mdstat_cmd.rc == 0
  changed_when: false
# Derive RAID health facts from the raw /proc/mdstat text in mdstat_cmd.stdout.
#
# Facts set: raid_md, mdstat_text, raid_present, raid_status,
# raid_is_degraded, raid_is_rebuilding, raid_is_checking, raid_action_line,
# raid_progress.
#
# Implementation notes:
#  * Intermediate values live in task-level `vars`, which are templated lazily
#    and may reference each other. Keys of one set_fact call cannot see their
#    siblings, so raid_is_degraded must not read raid_status directly.
#  * regex_search() called with a group argument returns a list of groups (or
#    nothing when there is no match), hence `default([''], true) | first`.
#  * Every pattern containing backslashes is written in a block scalar. In a
#    double-quoted YAML string "\\b" reaches Jinja as '\b', which a Jinja
#    string literal interprets as a backspace rather than a word boundary.
#  * All checks are limited to the stanza of the selected array so another
#    array's resync/check does not affect the result.
- name: Parse RAID status from mdstat
  vars:
    _md_name: "{{ RAID_DEVICE | regex_replace('^/dev/', '') }}"
    _mdstat: "{{ mdstat_cmd.stdout | default('') }}"
    # From the "<md> :" header line up to the next line that starts in column 0
    # (next array or the "unused devices" footer) or the end of the text.
    _md_stanza: >-
      {{
        _mdstat
        | regex_search('(?ms)^' ~ (_md_name | regex_escape) ~ '\\s*:.*?(?=^\\S|\\Z)')
        | default('', true)
      }}
    # Member map such as "UU" or "U_" following the "[n/m]" counter.
    _md_status: >-
      {{
        _md_stanza
        | regex_search('\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]', '\\1')
        | default([''], true)
        | first
      }}
    # Full progress line, e.g. "[==>....]  recovery = 12.6% (...)".
    _md_action_line: >-
      {{
        _md_stanza
        | regex_search('(?im)^([ \\t]*\\[[^\\]]+\\].*\\b(?:resync|recovery|reshape|repair|check)\\b.*)$', '\\1')
        | default([''], true)
        | first
        | trim
      }}
    # Percentage only, without the trailing "%".
    _md_progress: >-
      {{
        _md_stanza
        | regex_search('(?i)\\b(?:resync|recovery|reshape|repair|check)\\b\\s*=\\s*([0-9.]+)%', '\\1')
        | default([''], true)
        | first
      }}
  ansible.builtin.set_fact:
    raid_md: "{{ _md_name }}"
    mdstat_text: "{{ _mdstat }}"
    raid_present: "{{ (_md_stanza | length) > 0 }}"
    raid_status: "{{ _md_status }}"
    # An underscore in the member map marks a missing member.
    raid_is_degraded: "{{ '_' in _md_status }}"
    raid_is_rebuilding: >-
      {{ _md_stanza is search('(?i)\\b(resync|recovery|reshape|repair)\\b') }}
    raid_is_checking: >-
      {{ _md_stanza is search('(?i)\\bcheck\\b') }}
    raid_action_line: "{{ _md_action_line }}"
    raid_progress: "{{ _md_progress }}"
  changed_when: false
# Print the raw mdstat text and every parsed fact; runs only in debug mode.
- name: Debug | Show mdstat and parsed values
  ansible.builtin.debug:
    msg: |
      --- /proc/mdstat ---
      {{ mdstat_text }}
      --- Parsed ---
      raid_md={{ raid_md }}
      raid_present={{ raid_present }}
      raid_status={{ raid_status }}
      raid_is_degraded={{ raid_is_degraded }}
      raid_is_rebuilding={{ raid_is_rebuilding }}
      raid_is_checking={{ raid_is_checking }}
      raid_progress={{ raid_progress }}
      raid_action_line={{ raid_action_line }}
  # Cast before comparing, as the FAIL_ON_* conditions in this playbook do:
  # a templated variable may arrive as the string "1", which never equals 1.
  when: DEBUG | int == 1
  changed_when: false
# Stop when the requested md device was not found in the mdstat output
# (raid_present is false).
- name: Fail if RAID device is not present
  ansible.builtin.assert:
    success_msg: "RAID {{ raid_md }} found in /proc/mdstat."
    fail_msg: "RAID {{ raid_md }} not found in /proc/mdstat on {{ vm_ip }}."
    that:
      - raid_present
  changed_when: false
# Stop when raid_is_degraded is true; the failure message carries the parsed
# status and the trimmed mdstat text for diagnosis.
- name: Fail if RAID is degraded (missing member)
  ansible.builtin.assert:
    success_msg: "RAID {{ raid_md }} is OK: status={{ raid_status }}."
    fail_msg: >-
      RAID {{ raid_md }} is DEGRADED: status={{ raid_status }}.
      mdstat excerpt={{ (mdstat_text | trim) }}
    that:
      - not raid_is_degraded
  changed_when: false
# Optional gate: fail while a resync/recovery/reshape/repair is running.
# Active only when FAIL_ON_RESYNC is 1.
- name: Fail if RAID rebuild/resync is in progress (optional)
  vars:
    # raid_progress / raid_action_line can be a plain string, a one-element
    # list (regex_search with a group argument returns a list) or empty.
    # Collapse them to a single value so the message never shows "['12.6']"
    # and `length` is never applied to a non-string.
    _progress: "{{ [raid_progress | default('')] | flatten | first | default('', true) }}"
    _action_line: "{{ [raid_action_line | default('')] | flatten | first | default('', true) }}"
  ansible.builtin.assert:
    that:
      - not raid_is_rebuilding
    # default(..., true) so that an empty action line is reported as n/a;
    # the variable is always defined, so a plain default() never applied.
    fail_msg: >-
      RAID {{ raid_md }} is rebuilding/resyncing.
      {{ 'progress=' ~ _progress ~ '%; ' if (_progress | string | length) > 0 else '' }}
      line={{ _action_line | string | trim | default('n/a', true) }}
    success_msg: "RAID {{ raid_md }} is not rebuilding/resyncing."
  when: FAIL_ON_RESYNC | int == 1
  changed_when: false
# Optional gate: fail while a periodic "check" is running.
# Active only when FAIL_ON_CHECK is 1.
- name: Fail if RAID check is in progress (optional)
  vars:
    # raid_progress / raid_action_line can be a plain string, a one-element
    # list (regex_search with a group argument returns a list) or empty.
    # Collapse them to a single value so the message never shows "['12.6']"
    # and `length` is never applied to a non-string.
    _progress: "{{ [raid_progress | default('')] | flatten | first | default('', true) }}"
    _action_line: "{{ [raid_action_line | default('')] | flatten | first | default('', true) }}"
  ansible.builtin.assert:
    that:
      - not raid_is_checking
    # default(..., true) so that an empty action line is reported as n/a;
    # the variable is always defined, so a plain default() never applied.
    fail_msg: >-
      RAID {{ raid_md }} is running a periodic check.
      {{ 'progress=' ~ _progress ~ '%; ' if (_progress | string | length) > 0 else '' }}
      line={{ _action_line | string | trim | default('n/a', true) }}
    success_msg: "RAID {{ raid_md }} is not running a periodic check."
  when: FAIL_ON_CHECK | int == 1
  changed_when: false