From e47ccb64b72a5dec83de7b1a5a60188d0d7adf68 Mon Sep 17 00:00:00 2001 From: "martin.fencl" Date: Tue, 23 Dec 2025 23:38:23 +0100 Subject: [PATCH] redo --- check_raid.yml | 172 ++++++++++++++++++++++++++++--------------------- 1 file changed, 97 insertions(+), 75 deletions(-) diff --git a/check_raid.yml b/check_raid.yml index 7ef9e35..12a00ff 100644 --- a/check_raid.yml +++ b/check_raid.yml @@ -8,14 +8,18 @@ become_method: sudo vars: + # VM connection (provided by Semaphore env vars) vm_ip: "{{ lookup('env', 'VM_IP') }}" vm_user: "{{ lookup('env', 'VM_USER') }}" vm_pass: "{{ lookup('env', 'VM_PASS') }}" use_sudo: false + # Debug mode DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}" RETRIES: "{{ lookup('env', 'RETRIES') | default(25) | int }}" + # RAID specifics + # RAID_MD can be: md0 / md1 / ... OR "auto" to check all arrays found in /proc/mdstat raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}" raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}" raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}" @@ -24,6 +28,7 @@ raid_delay: 2 ssh_hard_timeout: 30 + # SSH options ssh_opts: - "-o" # English comments - "StrictHostKeyChecking=no" @@ -44,64 +49,65 @@ - "-o" - "NumberOfPasswordPrompts=1" - raid_commands: - - | - python3 - <<'PY' - # Parse /proc/mdstat and validate MD RAID state - import re, sys + raid_check_cmd: | + python3 - <<'PY' + # Print exactly one status line and exit with code: + # 0=OK, 1=FAIL (degraded/disallowed sync), 2=ERROR (unexpected/misconfig) + import re, sys - target = "{{ raid_md_device }}" - allow_sync = int("{{ raid_allow_sync }}") - allow_no_array = int("{{ raid_allow_no_array }}") + target = "{{ raid_md_device }}" + allow_sync = int("{{ raid_allow_sync }}") + allow_no_array = int("{{ raid_allow_no_array }}") - try: - txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read() - except Exception as e: - print(f"ERROR: cannot read /proc/mdstat: {e}") - sys.exit(2) + try: + txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read() + except Exception as e: + print(f"ERROR RAID read_mdstat err={e}") + sys.exit(2) - arrays = {} - header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE) - token_re = re.compile(r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$", re.MULTILINE) + arrays = {} + header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE) + token_re = re.compile(r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$", re.MULTILINE) - for m in header_re.finditer(txt): - name = m.group(1) - chunk = txt[m.end():m.end() + 3000] - tm = token_re.search(chunk) - if tm: - arrays[name] = tm.group(1) + for m in header_re.finditer(txt): + name = m.group(1) + chunk = txt[m.end():m.end() + 3000] + tm = token_re.search(chunk) + if tm: + arrays[name] = tm.group(1) - if not arrays: - print("NO_MD_ARRAYS: /proc/mdstat contains no active md arrays.") - print(txt.strip()) - sys.exit(0 if allow_no_array else 2) + if not arrays: + if allow_no_array: + print("OK RAID none=no-md-arrays") + sys.exit(0) + print("ERROR RAID none=no-md-arrays") + sys.exit(2) - syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt)) + syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt)) - if target == "auto": - to_check = sorted(arrays.keys()) - else: - if target not in arrays: - print(f"ERROR: {target} not found in /proc/mdstat. Found={sorted(arrays.keys())}") - print(txt.strip()) - sys.exit(2) - to_check = [target] + if target == "auto": + to_check = sorted(arrays.keys()) + else: + if target not in arrays: + found = ",".join(sorted(arrays.keys())) + print(f"ERROR RAID target_not_found target={target} found={found}") + sys.exit(2) + to_check = [target] - any_degraded = False - for name in to_check: - token = arrays[name] - degraded = "_" in token - any_degraded = any_degraded or degraded - print(f"RAID={name} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}") + tokens_str = " ".join([f"{name}=[{arrays[name]}]" for name in to_check]) + degraded = any("_" in arrays[name] for name in to_check) - if any_degraded: - sys.exit(1) + if degraded: + print(f"FAIL RAID {tokens_str} syncing={int(syncing)}") + sys.exit(1) - if syncing and not allow_sync: - sys.exit(1) + if syncing and not allow_sync: + print(f"FAIL RAID {tokens_str} syncing={int(syncing)} allow_sync={allow_sync}") + sys.exit(1) - sys.exit(0) - PY + print(f"OK RAID {tokens_str} syncing={int(syncing)}") + sys.exit(0) + PY tasks: - name: Ensure sshpass is installed (for password-based SSH) # English comments @@ -109,8 +115,9 @@ name: sshpass state: present update_cache: yes + run_once: true - - name: Run RAID check commands on VM (via SSH) # use SSHPASS env, hide item label + - name: Run RAID check on VM (via SSH) # single command, no loop ansible.builtin.command: argv: >- {{ @@ -119,43 +126,58 @@ + ssh_opts + [ vm_user ~ '@' ~ vm_ip, 'bash', '-lc', - ('sudo ' if use_sudo else '') + item + ('sudo ' if use_sudo else '') + raid_check_cmd ] }} environment: SSHPASS: "{{ vm_pass }}" - loop: "{{ raid_commands }}" - loop_control: - index_var: idx - label: "cmd-{{ idx }}" - register: raid_cmds + register: raid_cmd changed_when: false - failed_when: false - no_log: "{{ DEBUG == 0 }}" + failed_when: false # we decide via assert below retries: "{{ raid_retries }}" delay: "{{ raid_delay }}" - until: raid_cmds.rc not in [124, 255] + until: raid_cmd.rc not in [124, 255] run_once: true - - name: Show outputs for each RAID command - ansible.builtin.debug: - msg: | - RC: {{ item.rc }} - STDOUT: - {{ (item.stdout | default('')).strip() }} - STDERR: - {{ (item.stderr | default('')).strip() }} - loop: "{{ (raid_cmds.results if (raid_cmds.results is defined) else [raid_cmds]) }}" + - name: Build one-line summary (always) + ansible.builtin.set_fact: + raid_line: >- + {{ + (raid_cmd.stdout | default('') | trim) + if ((raid_cmd.stdout | default('') | trim) | length) > 0 + else ('ERROR RAID no-output rc=' ~ (raid_cmd.rc | string)) + }} + changed_when: false + run_once: true + + - name: RAID result (always one line) + ansible.builtin.assert: + that: + - raid_cmd.rc == 0 + success_msg: "{{ raid_line }}" + fail_msg: "{{ raid_line }}" + run_once: true + + # Optional verbose debug + - name: Debug | /proc/mdstat (VM) + ansible.builtin.command: + argv: >- + {{ + ['timeout', '-k', '5', (ssh_hard_timeout | string)] + + ['sshpass', '-e', 'ssh'] + + ssh_opts + + [ vm_user ~ '@' ~ vm_ip, 'bash', '-lc', "cat /proc/mdstat" ] + }} + environment: + SSHPASS: "{{ vm_pass }}" + register: mdstat_dbg + changed_when: false + failed_when: false when: DEBUG == 1 run_once: true - - name: Fail play if RAID check failed # English comments - ansible.builtin.assert: - that: "item.rc == 0" - fail_msg: "RAID check failed on VM: {{ (item.stdout | default(item.stderr) | default('no output')) | trim }}" - success_msg: "RAID check OK." - loop: "{{ (raid_cmds.results if (raid_cmds.results is defined) else [raid_cmds]) }}" - loop_control: - index_var: idx - label: "cmd-{{ idx }}" + - name: Debug | mdstat output + ansible.builtin.debug: + msg: "{{ mdstat_dbg.stdout | default('') }}" + when: DEBUG == 1 run_once: true