# check_raid.yml
---
- name: Check Linux MD RAID health on VM via Proxmox
  hosts: linux_servers
  gather_facts: false
  become: true
  become_user: root
  become_method: sudo

  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip: "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    use_sudo: false

    # --- Debug mode (controlled via Semaphore variable) ---
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"

    # --- RAID specifics ---
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
    # 1 = allow resync/recovery/reshape/check/repair; 0 = fail when such activity is detected
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"

    # Retry policy
    raid_retries: "{{ RETRIES }}"
    raid_delay: 2

    # SSH options (same style as the other working playbooks, but avoids interactive auth prompts)
    ssh_opts:
      - "-o"
      - "StrictHostKeyChecking=no"
      - "-o"
      - "UserKnownHostsFile=/dev/null"
      - "-o"
      - "ConnectTimeout=15"
      - "-o"
      - "PreferredAuthentications=password"
      - "-o"
      - "PubkeyAuthentication=no"
      - "-o"
      - "KbdInteractiveAuthentication=no"
      - "-o"
      - "NumberOfPasswordPrompts=1"

    # Commands to run on the target VM
    raid_commands:
      - |
        python3 - <<'PY'
        # Parse /proc/mdstat and validate MD RAID state
        import re, sys

        md = "{{ raid_md_device }}"
        allow_sync = int("{{ raid_allow_sync }}")

        try:
            txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
        except Exception as e:
            print(f"ERROR: cannot read /proc/mdstat: {e}")
            sys.exit(2)

        # Find a status token like [UU] / [U_] for the selected md device.
        # Example lines:
        #   md0 : active raid1 sdb1[0] sdc1[1]
        #         11718751232 blocks super 1.2 [2/2] [UU]
        pat = re.compile(
            rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]",
            re.MULTILINE,
        )
        m = pat.search(txt)
        if not m:
            print(f"ERROR: {md} not found in /proc/mdstat")
            print(txt.strip())
            sys.exit(2)

        token = m.group(1)
        syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
        degraded = "_" in token

        print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")

        if degraded:
            sys.exit(1)
        if syncing and not allow_sync:
            sys.exit(1)
        sys.exit(0)
        PY

  tasks:
    - name: Ensure sshpass is installed (for password-based SSH)
      ansible.builtin.apt:
        name: sshpass
        state: present
        update_cache: true

    - name: Run RAID check commands on VM (via SSH)
      # Authenticate via the SSHPASS env var; hide item content behind a short label
      ansible.builtin.command:
        argv: >-
          {{ ['sshpass', '-e', 'ssh']
          + ssh_opts
          + [vm_user ~ '@' ~ vm_ip, 'bash', '-lc', ('sudo ' if use_sudo else '') + item] }}
      environment:
        SSHPASS: "{{ vm_pass }}"
      loop: "{{ raid_commands }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"
      register: raid_cmds
      changed_when: false
      failed_when: false              # do not stop early; the assert below decides
      no_log: "{{ DEBUG | int == 0 }}"
      retries: "{{ raid_retries }}"
      delay: "{{ raid_delay }}"
      until: raid_cmds is succeeded   # command executed (rc may be non-zero; handled later)

    - name: Show outputs for each RAID command
      ansible.builtin.debug:
        msg: |
          CMD: {{ item.item }}
          RC: {{ item.rc }}
          STDOUT: {{ (item.stdout | default('')).strip() }}
          STDERR: {{ (item.stderr | default('')).strip() }}
      loop: "{{ raid_cmds.results }}"
      when: DEBUG | int == 1

    - name: Fail play if RAID check failed
      ansible.builtin.assert:
        that: "item.rc == 0"
        fail_msg: "RAID check failed on VM: {{ (item.stdout | default('')) or (item.stderr | default('')) or 'no output' }}"
        success_msg: "RAID check OK."
      loop: "{{ raid_cmds.results }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"
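
# A minimal, assumed example of running this playbook by hand outside Semaphore,
# exporting the same environment variables it reads on the controller
# (IP, credentials, and inventory path below are placeholders):
#
#   VM_IP=192.0.2.10 VM_USER=admin VM_PASS='secret' \
#   RAID_MD=md0 RAID_ALLOW_SYNC=1 DEBUG=1 RETRIES=25 \
#   ansible-playbook -i inventory.ini check_raid.yml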