From 3c7701a7603e6b4c0c7b828512f892235403b4eb Mon Sep 17 00:00:00 2001
From: "martin.fencl"
Date: Tue, 23 Dec 2025 20:01:45 +0100
Subject: [PATCH] add check raid

---
 check_raid.yml | 188 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 check_raid.yml

diff --git a/check_raid.yml b/check_raid.yml
new file mode 100644
index 0000000..c52b952
--- /dev/null
+++ b/check_raid.yml
@@ -0,0 +1,188 @@
+---
+# check_raid.yml
+#
+# Reads /proc/mdstat from a VM over password-based SSH and fails the job when
+# the selected mdadm array is missing, degraded, or (optionally) busy with a
+# resync/recovery or a periodic check.
+
+- name: Check mdadm RAID status on VM (via SSH from Semaphore host)
+  hosts: linux_servers
+  gather_facts: false
+  become: true
+  become_user: root
+  become_method: sudo
+
+  vars:
+    # VM connection (provided by Semaphore env vars)
+    vm_ip: "{{ lookup('env', 'VM_IP') }}"
+    vm_user: "{{ lookup('env', 'VM_USER') }}"
+    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
+    use_sudo: false
+
+    # lookup('env', ...) yields an empty string, not "undefined", for an unset
+    # variable, so each default() below passes `true` as its second argument;
+    # without it the fallback is skipped and `'' | int` silently becomes 0.
+
+    # Debug / behavior toggles (controlled via Semaphore variables)
+    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
+
+    # Which md device to check (md0, md1, ... or /dev/md0)
+    RAID_DEVICE: "{{ lookup('env', 'RAID_DEVICE') | default('md0', true) }}"
+
+    # Fail the job if rebuild/resync is in progress (recommended)
+    FAIL_ON_RESYNC: "{{ lookup('env', 'FAIL_ON_RESYNC') | default(1, true) | int }}"
+
+    # Fail the job if a periodic "check" is running (usually set 0 to avoid noise)
+    FAIL_ON_CHECK: "{{ lookup('env', 'FAIL_ON_CHECK') | default(0, true) | int }}"
+
+  tasks:
+    - name: Ensure sshpass is installed (for password-based SSH)
+      ansible.builtin.apt:
+        name: sshpass
+        state: present
+        update_cache: true
+        cache_valid_time: 86400
+
+    - name: Read /proc/mdstat from VM (via SSH)
+      ansible.builtin.command:
+        argv:
+          - sshpass
+          - -e
+          - ssh
+          - -o
+          # NOTE(review): host key verification is disabled; prefer a managed
+          # known_hosts file if the environment allows it.
+          - StrictHostKeyChecking=no
+          - -o
+          - ConnectTimeout=15
+          - "{{ vm_user }}@{{ vm_ip }}"
+          # ssh joins every trailing argument with spaces before handing the
+          # result to the remote shell, so the command must travel as ONE
+          # pre-quoted argument; otherwise `bash -lc cat /proc/mdstat` runs a
+          # bare `cat` with /proc/mdstat as $0.
+          - "bash -lc {{ (('sudo ' if use_sudo else '') ~ 'cat /proc/mdstat') | quote }}"
+      environment:
+        SSHPASS: "{{ vm_pass }}"
+      register: mdstat_cmd
+      changed_when: false
+      # Never fail here; the assert below reports a readable error instead.
+      failed_when: false
+      # Hide stdout/stderr unless DEBUG is enabled. `| int` keeps the comparison
+      # valid even when the templated variable arrives as a string.
+      no_log: "{{ (DEBUG | int) == 0 }}"
+
+    - name: Fail if we cannot read /proc/mdstat (SSH/auth/network)
+      ansible.builtin.assert:
+        that:
+          - (mdstat_cmd.rc | default(-1)) == 0
+        fail_msg: >-
+          Cannot read /proc/mdstat from {{ vm_ip }} (rc={{ mdstat_cmd.rc | default('n/a') }}).
+          stderr={{ (mdstat_cmd.stderr | default('') | trim) }}
+        success_msg: "Successfully read /proc/mdstat from {{ vm_ip }}."
+
+    # Facts set by one set_fact task are not visible to the other keys of that
+    # same task, so the stanza is extracted first and parsed in a second task.
+    # All regexes live in block scalars: inside YAML double quotes `\\b` would
+    # reach Jinja as `\b`, which Jinja string literals decode to a backspace.
+    - name: Extract the stanza of the selected RAID device from mdstat
+      ansible.builtin.set_fact:
+        raid_md: "{{ RAID_DEVICE | regex_replace('^/dev/', '') }}"
+        mdstat_text: "{{ mdstat_cmd.stdout | default('') }}"
+        # From the "mdX :" header up to the next line that starts in column 0.
+        raid_section: >-
+          {{
+            (mdstat_cmd.stdout | default('')) |
+            regex_search(
+              '(?ms)^' ~ (RAID_DEVICE | regex_replace('^/dev/', '') | regex_escape)
+              ~ '\\s*:.*?(?=^\\S|\\Z)'
+            ) | default('', true)
+          }}
+
+    - name: Parse RAID status from the stanza
+      ansible.builtin.set_fact:
+        raid_present: "{{ (raid_section | length) > 0 }}"
+        # regex_search with a group argument returns a list (or nothing when
+        # there is no match), so take the first element and fall back to ''.
+        raid_status: >-
+          {{
+            raid_section |
+            regex_search('\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]', '\\1') |
+            default([], true) | first | default('')
+          }}
+        # Degraded means at least one "_" in the [UU_] member map.
+        raid_is_degraded: >-
+          {{ raid_section is search('\\[[0-9]+/[0-9]+\\]\\s*\\[[U_]*_[U_]*\\]') }}
+        raid_is_rebuilding: >-
+          {{ raid_section is search('(?i)\\b(resync|recovery|reshape|repair)\\b') }}
+        raid_is_checking: >-
+          {{ raid_section is search('(?i)\\bcheck\\b') }}
+        raid_action_line: >-
+          {{
+            raid_section |
+            regex_search(
+              '(?im)^(\\s*\\[[^\\]]+\\].*\\b(resync|recovery|reshape|repair|check)\\b.*)$',
+              '\\1'
+            ) | default([], true) | first | default('') | trim
+          }}
+        raid_progress: >-
+          {{
+            raid_section |
+            regex_search(
+              '(?i)\\b(resync|recovery|reshape|repair|check)\\b\\s*=\\s*([0-9.]+)%',
+              '\\2'
+            ) | default([], true) | first | default('')
+          }}
+
+    - name: Debug | Show mdstat and parsed values
+      ansible.builtin.debug:
+        msg: |
+          --- /proc/mdstat ---
+          {{ mdstat_text }}
+          --- Parsed ---
+          raid_md={{ raid_md }}
+          raid_present={{ raid_present }}
+          raid_status={{ raid_status }}
+          raid_is_degraded={{ raid_is_degraded }}
+          raid_is_rebuilding={{ raid_is_rebuilding }}
+          raid_is_checking={{ raid_is_checking }}
+          raid_progress={{ raid_progress }}
+          raid_action_line={{ raid_action_line }}
+      when: DEBUG | int == 1
+
+    - name: Fail if RAID device is not present
+      ansible.builtin.assert:
+        that:
+          - raid_present
+        fail_msg: "RAID {{ raid_md }} not found in /proc/mdstat on {{ vm_ip }}."
+        success_msg: "RAID {{ raid_md }} found in /proc/mdstat."
+
+    - name: Fail if RAID is degraded (missing member)
+      ansible.builtin.assert:
+        that:
+          - not raid_is_degraded
+        fail_msg: >-
+          RAID {{ raid_md }} is DEGRADED: status={{ raid_status }}.
+          mdstat excerpt={{ (mdstat_text | trim) }}
+        success_msg: "RAID {{ raid_md }} is OK: status={{ raid_status }}."
+
+    - name: Fail if RAID rebuild/resync is in progress (optional)
+      ansible.builtin.assert:
+        that:
+          - not raid_is_rebuilding
+        fail_msg: >-
+          RAID {{ raid_md }} is rebuilding/resyncing.
+          {{ 'progress=' ~ raid_progress ~ '%; ' if (raid_progress | length) > 0 else '' }}
+          line={{ raid_action_line | default('n/a', true) }}
+        success_msg: "RAID {{ raid_md }} is not rebuilding/resyncing."
+      when: FAIL_ON_RESYNC | int == 1
+
+    - name: Fail if RAID check is in progress (optional)
+      ansible.builtin.assert:
+        that:
+          - not raid_is_checking
+        fail_msg: >-
+          RAID {{ raid_md }} is running a periodic check.
+          {{ 'progress=' ~ raid_progress ~ '%; ' if (raid_progress | length) > 0 else '' }}
+          line={{ raid_action_line | default('n/a', true) }}
+        success_msg: "RAID {{ raid_md }} is not running a periodic check."
+      when: FAIL_ON_CHECK | int == 1