redo

2025-12-23 23:14:47 +01:00
parent d413dcb29f
commit 74c3ef8945
1 changed file with 46 additions and 15 deletions
@@ -19,9 +19,12 @@
    RETRIES: "{{ lookup('env', 'RETRIES') | default(25) | int }}"

    # --- RAID specifics ---
+    # RAID_MD can be: md0 / md1 / ... OR "auto" to check all arrays found in /proc/mdstat
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
    # 1 = allow resync/recovery/reshape/check/repair; 0 = fail when such activity is detected
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
+    # 1 = do not fail when no MD arrays exist on the target; 0 (default) = fail in that case
+    raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}"

    # Retry policy
    raid_retries: "{{ RETRIES }}"
@@ -54,8 +57,9 @@
        # Parse /proc/mdstat and validate MD RAID state
        import re, sys

-        md = "{{ raid_md_device }}"
+        target = "{{ raid_md_device }}"
        allow_sync = int("{{ raid_allow_sync }}")
+        allow_no_array = int("{{ raid_allow_no_array }}")

        try:
            txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
@@ -63,23 +67,51 @@
            print(f"ERROR: cannot read /proc/mdstat: {e}")
            sys.exit(2)

-        pat = re.compile(
-            rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]",
-            re.MULTILINE
-        )
-        m = pat.search(txt)
-        if not m:
-            print(f"ERROR: {md} not found in /proc/mdstat")
+        # Find all active md arrays and record each one's member-status token,
+        # e.g. the "UU" in: [2/2] [UU]  (an "_" in the token marks a degraded array)
+        arrays = {}
+        header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE)
+        token_re = re.compile(r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$", re.MULTILINE)
+
+        for m in header_re.finditer(txt):
+            name = m.group(1)
+            start = m.end()
+            # Search the 3000 chars after this header for the first token line (NOTE: window is not capped at the next md header)
+            chunk = txt[start:start + 3000]
+            tm = token_re.search(chunk)
+            if tm:
+                arrays[name] = tm.group(1)
+
+        if not arrays:
+            msg = "NO_MD_ARRAYS: /proc/mdstat contains no active md arrays."
+            print(msg)
            print(txt.strip())
-            sys.exit(2)
+            sys.exit(0 if allow_no_array else 2)

-        token = m.group(1)
        syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
-        degraded = "_" in token

-        print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")
+        # Decide which arrays to check
+        if target == "auto":
+            to_check = sorted(arrays.keys())
+        else:
+            if target not in arrays:
+                print(f"ERROR: {target} not found in /proc/mdstat. Found={sorted(arrays.keys())}")
+                print(txt.strip())
+                sys.exit(2)
+            to_check = [target]

-        if degraded:
+        bad = []
+        for name in to_check:
+            token = arrays[name]
+            degraded = "_" in token
+            bad.append((name, token, degraded))
+
+        # Print summary
+        for name, token, degraded in bad:
+            print(f"RAID={name} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")
+
+        # Fail conditions
+        if any(degraded for _, _, degraded in bad):
            sys.exit(1)

        if syncing and not allow_sync:
@@ -115,11 +147,10 @@
        label: "cmd-{{ idx }}"
      register: raid_cmds
      changed_when: false
-      failed_when: false            # do not stop early; assert below decides
+      failed_when: false
      no_log: "{{ DEBUG == 0 }}"
      retries: "{{ raid_retries }}"
      delay: "{{ raid_delay }}"
-      # Retry only on typical SSH/timeout failures (255=ssh error, 124=timeout)
      until: raid_cmds.rc not in [124, 255]

    - name: Show outputs for each RAID command