---
# check_raid.yml
#
# Checks Linux MD (software) RAID health on a VM by parsing /proc/mdstat.
# The play targets the `linux_servers` group (presumably the Proxmox host(s);
# confirm against the inventory). The first host in the play opens a
# password-based SSH session to the VM and runs a small Python script there.
#
# The remote script prints exactly one status line and exits with:
#   0 = OK, 1 = FAIL (degraded / disallowed sync), 2 = ERROR (misconfiguration)
#
# Inputs (environment variables read on the Ansible controller):
#   VM_IP, VM_USER, VM_PASS  - VM connection details
#   DEBUG                    - 1 to also dump /proc/mdstat (default 0)
#   RETRIES                  - SSH retry count (default 25)
#   RAID_MD                  - md0 / md1 / ... or "auto" (default md0)
#   RAID_ALLOW_SYNC          - 1 to accept a resync/check in progress (default 1)
#   RAID_ALLOW_NO_ARRAY      - 1 to accept a VM with no md arrays (default 0)
- name: Check Linux MD RAID health on VM via Proxmox
  hosts: linux_servers
  gather_facts: false
  become: true
  become_user: root
  become_method: sudo

  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip: "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    use_sudo: false

    # Debug mode and retry count.
    # The env lookup returns an empty string for an unset variable, so
    # default() needs its second argument (true) to replace empty values too;
    # without it an unset RETRIES would evaluate to 0 instead of 25.
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"

    # RAID specifics
    # RAID_MD can be: md0 / md1 / ... OR "auto" to check all arrays found
    # in /proc/mdstat
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
    raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}"
    raid_retries: "{{ RETRIES }}"
    raid_delay: 2
    ssh_hard_timeout: 30

    # SSH options: password-only authentication, no host-key verification or
    # known_hosts persistence, quiet logging, 15 s connect timeout.
    ssh_opts:
      - "-o"
      - "StrictHostKeyChecking=no"
      - "-o"
      - "UserKnownHostsFile=/dev/null"
      - "-o"
      - "GlobalKnownHostsFile=/dev/null"
      - "-o"
      - "LogLevel=ERROR"
      - "-o"
      - "ConnectTimeout=15"
      - "-o"
      - "PreferredAuthentications=password"
      - "-o"
      - "PubkeyAuthentication=no"
      - "-o"
      - "KbdInteractiveAuthentication=no"
      - "-o"
      - "NumberOfPasswordPrompts=1"

    # Script executed on the VM. The Jinja expressions below are rendered on
    # the controller before the script is sent over SSH.
    raid_check_cmd: |
      python3 - <<'PY'
      # Print exactly one status line and exit with code:
      # 0=OK, 1=FAIL (degraded/disallowed sync), 2=ERROR (unexpected/misconfig)
      import re, sys

      target = "{{ raid_md_device }}"
      allow_sync = int("{{ raid_allow_sync }}")
      allow_no_array = int("{{ raid_allow_no_array }}")

      try:
          txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
      except Exception as e:
          print(f"ERROR RAID read_mdstat err={e}")
          sys.exit(2)

      # Map array name -> member status token, e.g. {"md0": "UU"}.
      arrays = {}
      header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE)
      token_re = re.compile(
          r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$",
          re.MULTILINE,
      )

      headers = list(header_re.finditer(txt))
      for idx, m in enumerate(headers):
          name = m.group(1)
          # Only search up to the next array header so that an array without
          # a status token (e.g. raid0) never picks up its neighbour's token.
          if idx + 1 < len(headers):
              section_end = headers[idx + 1].start()
          else:
              section_end = len(txt)
          tm = token_re.search(txt[m.end():section_end])
          if tm:
              arrays[name] = tm.group(1)

      if not arrays:
          if allow_no_array:
              print("OK RAID none=no-md-arrays")
              sys.exit(0)
          print("ERROR RAID none=no-md-arrays")
          sys.exit(2)

      # Sync activity is detected across the whole file, not per array.
      syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))

      if target == "auto":
          to_check = sorted(arrays.keys())
      else:
          if target not in arrays:
              found = ",".join(sorted(arrays.keys()))
              print(f"ERROR RAID target_not_found target={target} found={found}")
              sys.exit(2)
          to_check = [target]

      tokens_str = " ".join([f"{name}=[{arrays[name]}]" for name in to_check])
      # An underscore in the status token marks a missing/failed member.
      degraded = any("_" in arrays[name] for name in to_check)

      if degraded:
          print(f"FAIL RAID {tokens_str} syncing={int(syncing)}")
          sys.exit(1)

      if syncing and not allow_sync:
          print(f"FAIL RAID {tokens_str} syncing={int(syncing)} allow_sync={allow_sync}")
          sys.exit(1)

      print(f"OK RAID {tokens_str} syncing={int(syncing)}")
      sys.exit(0)
      PY

  tasks:
    - name: Ensure sshpass is installed (for password-based SSH)
      ansible.builtin.apt:
        name: sshpass
        state: present
        update_cache: true
      run_once: true

    # Single command, no loop. ssh joins all trailing arguments with spaces
    # and hands the result to the remote shell, so the script is passed as ONE
    # shell-quoted string; otherwise `bash -lc` would only receive its first
    # word (which breaks use_sudo and any multi-word command).
    - name: Run RAID check on VM (via SSH)
      ansible.builtin.command:
        argv: >-
          {{
            ['timeout', '-k', '5', (ssh_hard_timeout | string)]
            + ['sshpass', '-e', 'ssh']
            + ssh_opts
            + [
                vm_user ~ '@' ~ vm_ip,
                'bash -lc ' ~ ((('sudo ' if use_sudo else '') ~ raid_check_cmd) | quote)
              ]
          }}
      environment:
        # sshpass -e reads the password from this variable.
        SSHPASS: "{{ vm_pass }}"
      register: raid_cmd
      changed_when: false
      failed_when: false  # we decide via assert below
      retries: "{{ raid_retries }}"
      delay: "{{ raid_delay }}"
      # Retry only transient transport failures: 124 = timeout expired,
      # 137 = timeout had to escalate to SIGKILL, 255 = ssh error.
      # The check script itself only exits with 0, 1 or 2.
      until: raid_cmd.rc not in [124, 137, 255]
      run_once: true

    - name: Build one-line summary (always)
      ansible.builtin.set_fact:
        # Use the script's output when there is any; otherwise synthesize an
        # error line carrying the exit code.
        raid_line: >-
          {{
            (raid_cmd.stdout | default('') | trim)
            if ((raid_cmd.stdout | default('') | trim) | length) > 0
            else ('ERROR RAID no-output rc=' ~ (raid_cmd.rc | string))
          }}
      changed_when: false
      run_once: true

    - name: RAID result (always one line)
      ansible.builtin.assert:
        that:
          - raid_cmd.rc == 0
        success_msg: "{{ raid_line }}"
        fail_msg: "{{ raid_line }}"
      run_once: true

    # Optional verbose debug.
    # DEBUG is cast with `int` in the conditions because a templated variable
    # can arrive as the string "1", which never equals the integer 1.
    - name: Debug | /proc/mdstat (VM)
      ansible.builtin.command:
        argv: >-
          {{
            ['timeout', '-k', '5', (ssh_hard_timeout | string)]
            + ['sshpass', '-e', 'ssh']
            + ssh_opts
            + [
                vm_user ~ '@' ~ vm_ip,
                'bash -lc ' ~ ('cat /proc/mdstat' | quote)
              ]
          }}
      environment:
        SSHPASS: "{{ vm_pass }}"
      register: mdstat_dbg
      changed_when: false
      failed_when: false
      when: (DEBUG | int) == 1
      run_once: true

    - name: Debug | mdstat output
      ansible.builtin.debug:
        msg: "{{ mdstat_dbg.stdout | default('') }}"
      when: (DEBUG | int) == 1
      run_once: true