From 74c3ef894555785dfba6efc7649a62e078cc1517 Mon Sep 17 00:00:00 2001 From: "martin.fencl" Date: Tue, 23 Dec 2025 23:14:47 +0100 Subject: [PATCH] redo --- check_raid.yml | 61 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/check_raid.yml b/check_raid.yml index 9b380ac..717e8df 100644 --- a/check_raid.yml +++ b/check_raid.yml @@ -19,9 +19,12 @@ RETRIES: "{{ lookup('env', 'RETRIES') | default(25) | int }}" # --- RAID specifics --- + # RAID_MD can be: md0 / md1 / ... OR "auto" to check all arrays found in /proc/mdstat raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}" # 1 = allow resync/recovery/reshape/check/repair; 0 = fail when such activity is detected raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}" + # 1 = do not fail when no MD arrays exist on the target + raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}" # Retry policy raid_retries: "{{ RETRIES }}" @@ -54,8 +57,9 @@ # Parse /proc/mdstat and validate MD RAID state import re, sys - md = "{{ raid_md_device }}" + target = "{{ raid_md_device }}" allow_sync = int("{{ raid_allow_sync }}") + allow_no_array = int("{{ raid_allow_no_array }}") try: txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read() @@ -63,23 +67,51 @@ print(f"ERROR: cannot read /proc/mdstat: {e}") sys.exit(2) - pat = re.compile( - rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]", - re.MULTILINE - ) - m = pat.search(txt) - if not m: - print(f"ERROR: {md} not found in /proc/mdstat") + # Find all md arrays present + # We parse tokens like: [2/2] [UU] + arrays = {} + header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE) + token_re = re.compile(r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$", re.MULTILINE) + + for m in header_re.finditer(txt): + name = m.group(1) + start = m.end() + # Look ahead for the next token line after this header + chunk = txt[start:start + 3000] + tm = token_re.search(chunk) + if tm: + arrays[name] = tm.group(1) + + if not arrays: + msg = "NO_MD_ARRAYS: /proc/mdstat contains no active md arrays." + print(msg) print(txt.strip()) - sys.exit(2) + sys.exit(0 if allow_no_array else 2) - token = m.group(1) syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt)) - degraded = "_" in token - print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}") + # Decide which arrays to check + if target == "auto": + to_check = sorted(arrays.keys()) + else: + if target not in arrays: + print(f"ERROR: {target} not found in /proc/mdstat. Found={sorted(arrays.keys())}") + print(txt.strip()) + sys.exit(2) + to_check = [target] - if degraded: + bad = [] + for name in to_check: + token = arrays[name] + degraded = "_" in token + bad.append((name, token, degraded)) + + # Print summary + for name, token, degraded in bad: + print(f"RAID={name} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}") + + # Fail conditions + if any(degraded for _, _, degraded in bad): sys.exit(1) if syncing and not allow_sync: @@ -115,11 +147,10 @@ label: "cmd-{{ idx }}" register: raid_cmds changed_when: false - failed_when: false # do not stop early; assert below decides + failed_when: false no_log: "{{ DEBUG == 0 }}" retries: "{{ raid_retries }}" delay: "{{ raid_delay }}" - # Retry only on typical SSH/timeout failures (255=ssh error, 124=timeout) until: raid_cmds.rc not in [124, 255] - name: Show outputs for each RAID command