redo

2025-12-23 23:34:57 +01:00
parent 74c3ef8945
commit 4038f5b6a1
1 changed files with 12 additions and 27 deletions
--- a/check_raid.yml
+++ b/check_raid.yml
@@ -8,38 +8,32 @@
  become_method: sudo
  vars:
    # VM connection (provided by Semaphore env vars)
    vm_ip:   "{{ lookup('env', 'VM_IP') }}"
    vm_user: "{{ lookup('env', 'VM_USER') }}"
    vm_pass: "{{ lookup('env', 'VM_PASS') }}"
    use_sudo: false
    # --- Debug mode and retry count (controlled via Semaphore variables) ---
    # lookup('env', ...) yields an empty string when the variable is unset, so
    # default() needs its second argument set to true to replace that empty
    # value. Without it, an unset RETRIES is cast from '' to 0 instead of
    # falling back to 25.
    DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
    RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"
    # --- RAID specifics ---
    # RAID_MD can be: md0 / md1 / ... OR "auto" to check all arrays found in /proc/mdstat
    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
    # 1 = allow resync/recovery/reshape/check/repair; 0 = fail when such activity is detected
    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
    # 1 = do not fail when no MD arrays exist on the target
    raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}"
    # Retry policy
    raid_retries: "{{ RETRIES }}"
    raid_delay: 2
    # Hard timeout for the whole SSH command (prevents hanging forever)
    ssh_hard_timeout: 30
    # SSH options (same style, but avoids auth prompts)
    ssh_opts:
      - "-o"  # each "-o" introduces the option given on the next line
      - "StrictHostKeyChecking=no"
      - "-o"
      - "UserKnownHostsFile=/dev/null"
      - "-o"
      - "GlobalKnownHostsFile=/dev/null"
      - "-o"
      - "LogLevel=ERROR"
      - "-o"
      - "ConnectTimeout=15"
      - "-o"
      - "PreferredAuthentications=password"
@@ -50,7 +44,6 @@
      - "-o"
      - "NumberOfPasswordPrompts=1"
    # Commands to run on the target VM
    raid_commands:
      - |
        python3 - <<'PY'
@@ -67,30 +60,24 @@
            print(f"ERROR: cannot read /proc/mdstat: {e}")
            sys.exit(2)
        # Find all md arrays present
        # We parse tokens like: [2/2] [UU]
        arrays = {}
        header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE)
        token_re = re.compile(r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$", re.MULTILINE)
        for m in header_re.finditer(txt):
            name = m.group(1)
-            start = m.end()
+            chunk = txt[m.end():m.end() + 3000]
            # Look ahead for the next token line after this header
            chunk = txt[start:start + 3000]
            tm = token_re.search(chunk)
            if tm:
                arrays[name] = tm.group(1)
        if not arrays:
-            msg = "NO_MD_ARRAYS: /proc/mdstat contains no active md arrays."
+            print("NO_MD_ARRAYS: /proc/mdstat contains no active md arrays.")
            print(msg)
            print(txt.strip())
            sys.exit(0 if allow_no_array else 2)
        syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
        # Decide which arrays to check
        if target == "auto":
            to_check = sorted(arrays.keys())
        else:
@@ -100,18 +87,14 @@
                sys.exit(2)
            to_check = [target]
-        bad = []
+        any_degraded = False
        for name in to_check:
            token = arrays[name]
            degraded = "_" in token
-            bad.append((name, token, degraded))
+            any_degraded = any_degraded or degraded
        # Print summary
        for name, token, degraded in bad:
            print(f"RAID={name} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")
-        # Fail conditions
+        if any_degraded:
        if any(degraded for _, _, degraded in bad):
            sys.exit(1)
        if syncing and not allow_sync:
@@ -152,11 +135,11 @@
      retries: "{{ raid_retries }}"
      delay: "{{ raid_delay }}"
      until: raid_cmds.rc not in [124, 255]
      run_once: true
    - name: Show outputs for each RAID command
      ansible.builtin.debug:
        msg: |
          CMD: {{ item.item | default('n/a') }}
          RC:  {{ item.rc }}
          STDOUT:
          {{ (item.stdout | default('')).strip() }}
@@ -164,6 +147,7 @@
          {{ (item.stderr | default('')).strip() }}
      loop: "{{ (raid_cmds.results if (raid_cmds.results is defined) else [raid_cmds]) }}"
      # Cast before comparing: depending on the Ansible version and templating
      # mode, a templated var such as DEBUG can reach this expression as the
      # string "1" rather than the integer 1, making a bare `DEBUG == 1` false.
      when: (DEBUG | int) == 1
      run_once: true
    - name: Fail play if RAID check failed
      ansible.builtin.assert:
@@ -174,3 +158,4 @@
      loop_control:
        index_var: idx
        label: "cmd-{{ idx }}"
      run_once: true