From eb9c56bb5e7f9c499d408ac29ec72bddcb89ac8f Mon Sep 17 00:00:00 2001
From: "martin.fencl"
Date: Tue, 23 Dec 2025 23:04:13 +0100
Subject: [PATCH] redo

---
Review notes (ignored by git am; not part of the commit message):

 - Line structure and indentation were restored assuming conventional
   2-space Ansible layout; confirm context lines against check_raid.yml.
 - argv is built by list concatenation, so every entry of ssh_opts is
   passed regardless of the list length.
 - RETRIES uses default(25, true): the env lookup returns an empty string
   when the variable is unset, which plain default() does not replace.
 - The ssh task retries only while ssh itself fails (rc 255); the final
   assert still decides whether the play fails.
 - DEBUG is cast with "| int" before being compared to a number.
 - The post-image blob id on the "index" line predates these edits.

 check_raid.yml | 191 ++++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 111 insertions(+), 80 deletions(-)

diff --git a/check_raid.yml b/check_raid.yml
index 6b7c59e..aff7a9b 100644
--- a/check_raid.yml
+++ b/check_raid.yml
@@ -14,19 +14,81 @@
     vm_pass: "{{ lookup('env', 'VM_PASS') }}"
     use_sudo: false
 
-    # Debug / retries
+    # --- Debug mode (controlled via Semaphore variable) ---
     DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
-    RETRIES: "{{ lookup('env', 'RETRIES') | default(10) | int }}"
+    # lookup('env') yields '' when unset, so default() needs its boolean flag (true) to apply
+    RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"
 
-    # RAID device to check (e.g. md0, md1...)
+    # --- RAID specifics ---
     raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
-
-    # If 0 => fail when resync/recovery/reshape/check/repair is detected in /proc/mdstat
-    # If 1 => allow sync operations (still fails only on degraded [U_], [_U], etc.)
+    # 1 = allow resync/recovery/reshape/check/repair; 0 = fail when such activity is detected
     raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
 
-    # SSH options
-    ssh_connect_timeout: 15
+    # Retry policy
+    raid_retries: "{{ RETRIES }}"
+    raid_delay: 2
+
+    # SSH options (keeps same style as your working playbooks, but avoids auth prompts)
+    ssh_opts:
+      - "-o"
+      - "StrictHostKeyChecking=no"
+      - "-o"
+      - "UserKnownHostsFile=/dev/null"
+      - "-o"
+      - "ConnectTimeout=15"
+      - "-o"
+      - "PreferredAuthentications=password"
+      - "-o"
+      - "PubkeyAuthentication=no"
+      - "-o"
+      - "KbdInteractiveAuthentication=no"
+      - "-o"
+      - "NumberOfPasswordPrompts=1"
+
+    # Commands to run on the target VM
+    raid_commands:
+      - |
+        python3 - <<'PY'
+        # Parse /proc/mdstat and validate MD RAID state
+        import re, sys
+
+        md = "{{ raid_md_device }}"
+        allow_sync = int("{{ raid_allow_sync }}")
+
+        try:
+            txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
+        except Exception as e:
+            print(f"ERROR: cannot read /proc/mdstat: {e}")
+            sys.exit(2)
+
+        # Find token like [UU] / [U_] for the selected md device
+        # Example lines:
+        # md0 : active raid1 sdb1[0] sdc1[1]
+        # 11718751232 blocks super 1.2 [2/2] [UU]
+        pat = re.compile(
+            rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]",
+            re.MULTILINE
+        )
+        m = pat.search(txt)
+        if not m:
+            print(f"ERROR: {md} not found in /proc/mdstat")
+            print(txt.strip())
+            sys.exit(2)
+
+        token = m.group(1)
+        syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
+
+        degraded = "_" in token
+        print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")
+
+        if degraded:
+            sys.exit(1)
+
+        if syncing and not allow_sync:
+            sys.exit(1)
+
+        sys.exit(0)
+        PY
 
   tasks:
     - name: Ensure sshpass is installed (for password-based SSH) # English comments
@@ -35,83 +97,52 @@
         state: present
         update_cache: yes
 
-    - name: Read /proc/mdstat from VM (via SSH) # English comments
+    - name: Run RAID check commands on VM (via SSH) # use SSHPASS env, hide item label
       ansible.builtin.command:
-        argv:
-          - sshpass
-          - -e
-          - ssh
-          - -o
-          - StrictHostKeyChecking=no
-          - -o
-          - "ConnectTimeout={{ ssh_connect_timeout }}"
-          - "{{ vm_user }}@{{ vm_ip }}"
-          - bash
-          - -lc
-          - "{{ ('sudo ' if use_sudo else '') + 'cat /proc/mdstat' }}"
+        # Concatenate lists so every entry of ssh_opts is passed, whatever its length
+        argv: >-
+          {{
+            ['sshpass', '-e', 'ssh']
+            + ssh_opts
+            + [vm_user ~ '@' ~ vm_ip, 'bash', '-lc', ('sudo ' if use_sudo else '') ~ item]
+          }}
       environment:
         SSHPASS: "{{ vm_pass }}"
-      register: mdstat_raw
+      loop: "{{ raid_commands }}"
+      loop_control:
+        index_var: idx
+        label: "cmd-{{ idx }}"
+      register: raid_cmds
       changed_when: false
-      no_log: "{{ DEBUG == 0 }}"
-      retries: "{{ RETRIES }}"
-      delay: 2
-      until: mdstat_raw.rc == 0
+      failed_when: false # do not stop early; assert below decides
+      # An item that runs out of retries is reported as failed; continue so the assert reports it
+      ignore_errors: true
+      # DEBUG may be rendered as a string, so cast it before comparing with a number
+      no_log: "{{ (DEBUG | int) == 0 }}"
+      retries: "{{ raid_retries }}"
+      delay: "{{ raid_delay }}"
+      # rc 255 is ssh's own error status; any other rc means the remote command ran
+      until: (raid_cmds.rc | default(255)) != 255
 
-    - name: Debug | Show raw /proc/mdstat # English comments
+    - name: Show outputs for each RAID command
       ansible.builtin.debug:
-        msg: "{{ mdstat_raw.stdout }}"
-      when: DEBUG == 1
+        msg: |
+          CMD: {{ item.item }}
+          RC: {{ item.rc }}
+          STDOUT:
+          {{ (item.stdout | default('')).strip() }}
+          STDERR:
+          {{ (item.stderr | default('')).strip() }}
+      loop: "{{ raid_cmds.results }}"
+      when: (DEBUG | int) == 1
 
-    - name: Extract RAID status token for selected MD device # English comments
-      ansible.builtin.set_fact:
-        raid_token: >-
-          {{
-            (mdstat_raw.stdout | regex_search(
-              raid_md_device ~ '\\s*:\\s*active.*\\n\\s*\\d+\\s+blocks.*\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]',
-              '\\1',
-              multiline=True
-            )) | default('')
-          }}
-
-    - name: Detect sync operations in mdstat (resync/recovery/reshape/check/repair) # English comments
-      ansible.builtin.set_fact:
-        raid_syncing: "{{ (mdstat_raw.stdout is search('resync|recovery|reshape|check|repair')) | bool }}"
-
-    - name: Compute degraded flag (underscore means missing member) # English comments
-      ansible.builtin.set_fact:
-        raid_degraded: "{{ (raid_token | length > 0) and ('_' in raid_token) }}"
-
-    - name: Fail if MD device not found in /proc/mdstat # English comments
+    - name: Fail play if RAID check failed # English comments
       ansible.builtin.assert:
-        that:
-          - raid_token | length > 0
-        fail_msg: "RAID device {{ raid_md_device }} was not found in /proc/mdstat on VM ({{ vm_ip }})."
-        success_msg: "RAID device {{ raid_md_device }} found in /proc/mdstat."
-      changed_when: false
-
-    - name: Fail if RAID is degraded (token contains '_') # English comments
-      ansible.builtin.assert:
-        that:
-          - not raid_degraded
-        fail_msg: "RAID {{ raid_md_device }} is DEGRADED: token=[{{ raid_token }}] (expected all 'U')."
-        success_msg: "RAID {{ raid_md_device }} is OK: token=[{{ raid_token }}]."
-      changed_when: false
-
-    - name: Fail if RAID is syncing and syncing is not allowed # English comments
-      ansible.builtin.assert:
-        that:
-          - (raid_allow_sync | int) == 1 or (not raid_syncing)
-        fail_msg: "RAID {{ raid_md_device }} is running a sync operation (resync/recovery/reshape/check/repair) and RAID_ALLOW_SYNC=0."
-        success_msg: "No sync operation detected (or RAID_ALLOW_SYNC=1)."
-      changed_when: false
-
-    - name: Print concise summary (debug) # English comments
-      ansible.builtin.debug:
-        msg: >-
-          RAID={{ raid_md_device }},
-          token=[{{ raid_token }}],
-          degraded={{ raid_degraded }},
-          syncing={{ raid_syncing }},
-          allow_sync={{ raid_allow_sync }}
-      when: DEBUG == 1
+        that: "item.rc == 0"
+        # default(..., true) also replaces empty strings, so an empty stdout falls through to stderr
+        fail_msg: "RAID check failed on VM: {{ item.stdout | default(item.stderr, true) | default('no output', true) }}"
+        success_msg: "RAID check OK."
+      loop: "{{ raid_cmds.results }}"
+      loop_control:
+        index_var: idx
+        label: "cmd-{{ idx }}"