This commit is contained in:
martin.fencl
2025-12-23 23:38:23 +01:00
parent 4038f5b6a1
commit e47ccb64b7

View File

@@ -8,14 +8,18 @@
become_method: sudo become_method: sudo
vars: vars:
# VM connection (provided by Semaphore env vars)
vm_ip: "{{ lookup('env', 'VM_IP') }}" vm_ip: "{{ lookup('env', 'VM_IP') }}"
vm_user: "{{ lookup('env', 'VM_USER') }}" vm_user: "{{ lookup('env', 'VM_USER') }}"
vm_pass: "{{ lookup('env', 'VM_PASS') }}" vm_pass: "{{ lookup('env', 'VM_PASS') }}"
use_sudo: false use_sudo: false
# Debug mode
# The env lookup returns an empty string (not undefined) when the variable is
# unset, so default() needs its second argument set to true to treat that
# empty value as missing; otherwise '' | int silently yields 0.
DEBUG: "{{ lookup('env', 'DEBUG') | default(0, true) | int }}"
RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"
# RAID specifics
# RAID_MD can be: md0 / md1 / ... OR "auto" to check all arrays found in /proc/mdstat
raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}" raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}" raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}" raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}"
@@ -24,6 +28,7 @@
raid_delay: 2 raid_delay: 2
ssh_hard_timeout: 30 ssh_hard_timeout: 30
# SSH options
ssh_opts: ssh_opts:
- "-o" # English comments - "-o" # English comments
- "StrictHostKeyChecking=no" - "StrictHostKeyChecking=no"
@@ -44,10 +49,10 @@
- "-o" - "-o"
- "NumberOfPasswordPrompts=1" - "NumberOfPasswordPrompts=1"
raid_commands: raid_check_cmd: |
- |
python3 - <<'PY' python3 - <<'PY'
# Parse /proc/mdstat and validate MD RAID state # Print exactly one status line and exit with code:
# 0=OK, 1=FAIL (degraded/disallowed sync), 2=ERROR (unexpected/misconfig)
import re, sys import re, sys
target = "{{ raid_md_device }}" target = "{{ raid_md_device }}"
@@ -57,7 +62,7 @@
try: try:
txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read() txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
except Exception as e: except Exception as e:
print(f"ERROR: cannot read /proc/mdstat: {e}") print(f"ERROR RAID read_mdstat err={e}")
sys.exit(2) sys.exit(2)
arrays = {} arrays = {}
@@ -72,9 +77,11 @@
arrays[name] = tm.group(1) arrays[name] = tm.group(1)
if not arrays: if not arrays:
print("NO_MD_ARRAYS: /proc/mdstat contains no active md arrays.") if allow_no_array:
print(txt.strip()) print("OK RAID none=no-md-arrays")
sys.exit(0 if allow_no_array else 2) sys.exit(0)
print("ERROR RAID none=no-md-arrays")
sys.exit(2)
syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt)) syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
@@ -82,24 +89,23 @@
to_check = sorted(arrays.keys()) to_check = sorted(arrays.keys())
else: else:
if target not in arrays: if target not in arrays:
print(f"ERROR: {target} not found in /proc/mdstat. Found={sorted(arrays.keys())}") found = ",".join(sorted(arrays.keys()))
print(txt.strip()) print(f"ERROR RAID target_not_found target={target} found={found}")
sys.exit(2) sys.exit(2)
to_check = [target] to_check = [target]
any_degraded = False tokens_str = " ".join([f"{name}=[{arrays[name]}]" for name in to_check])
for name in to_check: degraded = any("_" in arrays[name] for name in to_check)
token = arrays[name]
degraded = "_" in token
any_degraded = any_degraded or degraded
print(f"RAID={name} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")
if any_degraded: if degraded:
print(f"FAIL RAID {tokens_str} syncing={int(syncing)}")
sys.exit(1) sys.exit(1)
if syncing and not allow_sync: if syncing and not allow_sync:
print(f"FAIL RAID {tokens_str} syncing={int(syncing)} allow_sync={allow_sync}")
sys.exit(1) sys.exit(1)
print(f"OK RAID {tokens_str} syncing={int(syncing)}")
sys.exit(0) sys.exit(0)
PY PY
@@ -109,8 +115,9 @@
name: sshpass name: sshpass
state: present state: present
update_cache: yes update_cache: yes
run_once: true
- name: Run RAID check commands on VM (via SSH) # use SSHPASS env, hide item label - name: Run RAID check on VM (via SSH) # single command, no loop
ansible.builtin.command: ansible.builtin.command:
argv: >- argv: >-
{{ {{
@@ -119,43 +126,58 @@
+ ssh_opts + ssh_opts
+ [ vm_user ~ '@' ~ vm_ip, + [ vm_user ~ '@' ~ vm_ip,
'bash', '-lc', 'bash', '-lc',
('sudo ' if use_sudo else '') + item ('sudo ' if use_sudo else '') + raid_check_cmd
] ]
}} }}
environment: environment:
SSHPASS: "{{ vm_pass }}" SSHPASS: "{{ vm_pass }}"
loop: "{{ raid_commands }}" register: raid_cmd
loop_control:
index_var: idx
label: "cmd-{{ idx }}"
register: raid_cmds
changed_when: false changed_when: false
failed_when: false failed_when: false # we decide via assert below
no_log: "{{ DEBUG == 0 }}"
retries: "{{ raid_retries }}" retries: "{{ raid_retries }}"
delay: "{{ raid_delay }}" delay: "{{ raid_delay }}"
until: raid_cmds.rc not in [124, 255] until: raid_cmd.rc not in [124, 255]
run_once: true run_once: true
# Collapse the check result into a single line: the trimmed stdout of the
# registered raid_cmd result, or a synthetic ERROR line carrying the return
# code when the command printed nothing.
- name: Build one-line summary (always)
  ansible.builtin.set_fact:
    raid_line: >-
      {%- set out = raid_cmd.stdout | default('') | trim -%}
      {{ out if out else 'ERROR RAID no-output rc=' ~ raid_cmd.rc }}
  changed_when: false
  run_once: true
# Pass/fail gate for the play: succeeds only when the registered raid_cmd
# result has return code 0. The same raid_line text is printed either way, so
# the outcome is always reported as one line.
- name: RAID result (always one line)
  ansible.builtin.assert:
    that: raid_cmd.rc == 0
    fail_msg: "{{ raid_line }}"
    success_msg: "{{ raid_line }}"
  run_once: true
# Optional verbose debug: fetch the raw /proc/mdstat from the VM over SSH.
# ssh joins all trailing arguments with spaces and hands the resulting string
# to the remote shell, so passing 'bash', '-lc', 'cat /proc/mdstat' as three
# argv items would run `bash -lc cat /proc/mdstat` remotely (bare `cat`, with
# /proc/mdstat as $0). The remote command is therefore shell-quoted and sent
# as a single ssh argument.
- name: Debug | /proc/mdstat (VM)
  ansible.builtin.command:
    argv: >-
      {{
        ['timeout', '-k', '5', (ssh_hard_timeout | string)]
        + ['sshpass', '-e', 'ssh']
        + ssh_opts
        + [ vm_user ~ '@' ~ vm_ip, 'bash -lc ' ~ ('cat /proc/mdstat' | quote) ]
      }}
  environment:
    # Password is passed via the environment for `sshpass -e`.
    SSHPASS: "{{ vm_pass }}"
  register: mdstat_dbg
  changed_when: false
  # Best-effort diagnostics only: never fail the play from this task.
  failed_when: false
  # DEBUG is a templated var and may arrive as the string "1"; cast before
  # comparing so the condition holds regardless of Jinja native-type mode.
  when: (DEBUG | int) == 1
  run_once: true
# Print the stdout captured in mdstat_dbg (empty string when it is absent).
- name: Debug | mdstat output
  ansible.builtin.debug:
    msg: "{{ mdstat_dbg.stdout | default('') }}"
  # DEBUG is a templated var and may arrive as the string "1"; cast before
  # comparing so the condition holds regardless of Jinja native-type mode.
  when: (DEBUG | int) == 1
  run_once: true