# ansible/check_raid.yml

# check_raid.yml

# Play: check the health of a Linux MD RAID array inside a VM.
# The tasks run on the hosts in "linux_servers" and reach the VM from there
# with sshpass + ssh, using the VM_* environment variables for the connection.
- name: Check Linux MD RAID health on VM via Proxmox
  hosts: linux_servers
  gather_facts: false
  become: true
  become_user: root
  become_method: sudo

  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip: "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    # When true, the remote command is prefixed with "sudo ".
    use_sudo: false

    # --- Debug mode (controlled via Semaphore variable) ---
    # The env lookup yields an empty string (not "undefined") for an unset
    # variable, so default() only takes effect with its second argument set
    # to true. Without it RETRIES silently became 0 instead of 25.
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"

    # --- RAID specifics ---
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
    # 1 = allow resync/recovery/reshape/check/repair; 0 = fail when such activity is detected
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"

    # Retry policy: number of attempts and pause (seconds) between them.
    raid_retries: "{{ RETRIES | int }}"
    raid_delay: 2

    # SSH options: skip host-key verification and force a single,
    # non-interactive password attempt so ssh never blocks on a prompt.
    # The list is consumed as flat "-o", "<option>" pairs.
    ssh_opts:
      - "-o"
      - "StrictHostKeyChecking=no"
      - "-o"
      - "UserKnownHostsFile=/dev/null"
      - "-o"
      - "ConnectTimeout=15"
      - "-o"
      - "PreferredAuthentications=password"
      - "-o"
      - "PubkeyAuthentication=no"
      - "-o"
      - "KbdInteractiveAuthentication=no"
      - "-o"
      - "NumberOfPasswordPrompts=1"

    # Commands to run on the target VM.
    # Each entry is a shell snippet; its exit status decides the play result
    # (0 = OK, anything else = failure).
    raid_commands:
      - |
        python3 - <<'PY'
        # Validate one MD RAID array by parsing /proc/mdstat.
        # Exit status: 0 = healthy,
        #              1 = degraded, or sync activity while it is not allowed,
        #              2 = /proc/mdstat unreadable or the array was not found.
        import re, sys

        # Accept both "md0" and "/dev/md0"; /proc/mdstat lists the bare name.
        md = "{{ raid_md_device }}".rsplit("/", 1)[-1]
        allow_sync = int("{{ raid_allow_sync }}")

        try:
            with open("/proc/mdstat", "r", encoding="utf-8", errors="ignore") as fh:
                txt = fh.read()
        except Exception as e:
            print(f"ERROR: cannot read /proc/mdstat: {e}")
            sys.exit(2)

        # Find token like [UU] / [U_] for the selected md device
        # Example lines:
        # md0 : active raid1 sdb1[0] sdc1[1]
        #       11718751232 blocks super 1.2 [2/2] [UU]
        pat = re.compile(
            rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]",
            re.MULTILINE
        )
        m = pat.search(txt)
        if not m:
            print(f"ERROR: {md} not found in /proc/mdstat")
            print(txt.strip())
            sys.exit(2)

        token = m.group(1)

        # Keep only this array's stanza: its header line plus the indented
        # lines that follow. Scanning the whole file would report a resync or
        # check running on a different array as activity on this one.
        stanza = []
        for line in txt[m.start():].splitlines():
            if stanza and (not line.strip() or not line[0].isspace()):
                break
            stanza.append(line)
        syncing = bool(
            re.search(r"\b(resync|recovery|reshape|check|repair)\b", "\n".join(stanza))
        )

        # An underscore in the member map marks a missing/failed member.
        degraded = "_" in token
        print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")

        if degraded:
            sys.exit(1)

        if syncing and not allow_sync:
            sys.exit(1)

        sys.exit(0)
        PY

  tasks:
    # sshpass feeds the VM password to ssh non-interactively; the later SSH
    # task depends on it. Uses apt and refreshes the package cache first.
    - name: Ensure sshpass is installed (for password-based SSH)
      ansible.builtin.apt:
        update_cache: true
        state: present
        name: sshpass

    # Runs every entry of raid_commands on the VM through sshpass + ssh.
    # The password travels in the SSHPASS environment variable (sshpass -e),
    # not on the command line, and the loop label hides the script text.
    - name: Run RAID check commands on VM (via SSH)
      ansible.builtin.command:
        # argv is built from the whole ssh_opts list rather than fixed indexes:
        # the list has 14 entries, so the former ssh_opts[14] / ssh_opts[15]
        # references pointed past its end.
        # ssh joins its trailing arguments with spaces before handing them to
        # the remote shell, so the script is shell-quoted to make it reach
        # "bash -lc" as one argument (required for the "sudo " prefix to work).
        argv: >-
          {{
          ['sshpass', '-e', 'ssh']
          + ssh_opts
          + [vm_user ~ '@' ~ vm_ip, 'bash', '-lc',
          (('sudo ' if (use_sudo | bool) else '') + item) | quote]
          }}
      environment:
        SSHPASS: "{{ vm_pass }}"
      loop: "{{ raid_commands }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"
      register: raid_cmds
      changed_when: false
      failed_when: false            # do not stop early; assert below decides
      # DEBUG may be rendered as a string depending on the Ansible version,
      # so cast it before comparing with an integer.
      no_log: "{{ (DEBUG | int) == 0 }}"
      retries: "{{ raid_retries | int }}"
      delay: "{{ raid_delay | int }}"
      # ssh exits with 255 when the connection itself failed; retry only then.
      # With failed_when: false, "is succeeded" is always true and would never
      # trigger a retry. Any other rc means the command ran; the assert task
      # evaluates it later.
      until: (raid_cmds.rc | default(255) | int) != 255

    # Debug helper: prints the command, return code and captured output of
    # every registered result. Only runs when DEBUG is 1.
    - name: Show outputs for each RAID command
      ansible.builtin.debug:
        msg: |
          CMD: {{ item.item }}
          RC:  {{ item.rc | default('n/a') }}
          STDOUT:
          {{ (item.stdout | default('')).strip() }}
          STDERR:
          {{ (item.stderr | default('')).strip() }}
      loop: "{{ raid_cmds.results }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"
      # DEBUG may be rendered as a string depending on the Ansible version,
      # so cast it before comparing with an integer.
      when: (DEBUG | int) == 1

    # Final verdict: every registered command must have exited with 0.
    - name: Fail play if RAID check failed
      ansible.builtin.assert:
        that:
          # A result without rc (module could not run the command) counts as failed.
          - (item.rc | default(-1) | int) == 0
        # stdout is defined but empty when ssh itself fails, so default()
        # needs its boolean flag to fall through to stderr and then to the
        # fixed placeholder.
        fail_msg: "RAID check failed on VM: {{ item.stdout | default(item.stderr, true) | default('no output', true) }}"
        success_msg: "RAID check OK."
      loop: "{{ raid_cmds.results }}"
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"