forked from jakub/ansible
redo
This commit is contained in:
184
check_raid.yml
184
check_raid.yml
@@ -14,19 +14,80 @@
|
|||||||
vm_pass: "{{ lookup('env', 'VM_PASS') }}"
|
vm_pass: "{{ lookup('env', 'VM_PASS') }}"
|
||||||
use_sudo: false
|
use_sudo: false
|
||||||
|
|
||||||
# Debug / retries
|
# --- Debug mode (controlled via Semaphore variable) ---
|
||||||
DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
|
DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
|
||||||
RETRIES: "{{ lookup('env', 'RETRIES') | default(10) | int }}"
|
RETRIES: "{{ lookup('env', 'RETRIES') | default(25, true) | int }}"  # env lookup yields '' when unset, so default() needs boolean mode
|
||||||
|
|
||||||
# RAID device to check (e.g. md0, md1...)
|
# --- RAID specifics ---
|
||||||
raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
|
raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
|
||||||
|
# 1 = allow resync/recovery/reshape/check/repair; 0 = fail when such activity is detected
|
||||||
# If 0 => fail when resync/recovery/reshape/check/repair is detected in /proc/mdstat
|
|
||||||
# If 1 => allow sync operations (still fails only on degraded [U_], [_U], etc.)
|
|
||||||
raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
|
raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
|
||||||
|
|
||||||
# SSH options
|
# Retry policy
|
||||||
ssh_connect_timeout: 15
|
raid_retries: "{{ RETRIES }}"
|
||||||
|
raid_delay: 2
|
||||||
|
|
||||||
|
# SSH options: password-only auth with a single prompt, 15 s connect timeout; host-key verification is disabled (StrictHostKeyChecking=no, UserKnownHostsFile=/dev/null)
|
||||||
|
ssh_opts:
|
||||||
|
- "-o"
|
||||||
|
- "StrictHostKeyChecking=no"
|
||||||
|
- "-o"
|
||||||
|
- "UserKnownHostsFile=/dev/null"
|
||||||
|
- "-o"
|
||||||
|
- "ConnectTimeout=15"
|
||||||
|
- "-o"
|
||||||
|
- "PreferredAuthentications=password"
|
||||||
|
- "-o"
|
||||||
|
- "PubkeyAuthentication=no"
|
||||||
|
- "-o"
|
||||||
|
- "KbdInteractiveAuthentication=no"
|
||||||
|
- "-o"
|
||||||
|
- "NumberOfPasswordPrompts=1"
|
||||||
|
|
||||||
|
# Commands to run on the target VM
|
||||||
|
raid_commands:
|
||||||
|
- |
|
||||||
|
python3 - <<'PY'
|
||||||
|
# Parse /proc/mdstat and validate MD RAID state
|
||||||
|
import re, sys
|
||||||
|
|
||||||
|
md = "{{ raid_md_device }}"
|
||||||
|
allow_sync = int("{{ raid_allow_sync }}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERROR: cannot read /proc/mdstat: {e}")
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
# Find token like [UU] / [U_] for the selected md device
|
||||||
|
# Example lines:
|
||||||
|
# md0 : active raid1 sdb1[0] sdc1[1]
|
||||||
|
# 11718751232 blocks super 1.2 [2/2] [UU]
|
||||||
|
pat = re.compile(
|
||||||
|
rf"^{re.escape(md)}\s*:\s*active.*\n\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]",
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
|
m = pat.search(txt)
|
||||||
|
if not m:
|
||||||
|
print(f"ERROR: {md} not found in /proc/mdstat")
|
||||||
|
print(txt.strip())
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
token = m.group(1)
|
||||||
|
syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
|
||||||
|
|
||||||
|
degraded = "_" in token
|
||||||
|
print(f"RAID={md} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")
|
||||||
|
|
||||||
|
if degraded:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if syncing and not allow_sync:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
PY
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Ensure sshpass is installed (for password-based SSH) # English comments
|
- name: Ensure sshpass is installed (for password-based SSH) # English comments
|
||||||
@@ -35,83 +96,64 @@
|
|||||||
state: present
|
state: present
|
||||||
update_cache: yes
|
update_cache: yes
|
||||||
|
|
||||||
- name: Read /proc/mdstat from VM (via SSH) # English comments
|
- name: Run RAID check commands on VM (via SSH) # use SSHPASS env, hide item label
|
||||||
ansible.builtin.command:
|
ansible.builtin.command:
|
||||||
argv:
|
argv:
|
||||||
- sshpass
|
- sshpass
|
||||||
- -e
|
- -e
|
||||||
- ssh
|
- ssh
|
||||||
- -o
|
- "{{ ssh_opts[0] }}"
|
||||||
- StrictHostKeyChecking=no
|
- "{{ ssh_opts[1] }}"
|
||||||
- -o
|
- "{{ ssh_opts[2] }}"
|
||||||
- "ConnectTimeout={{ ssh_connect_timeout }}"
|
- "{{ ssh_opts[3] }}"
|
||||||
|
- "{{ ssh_opts[4] }}"
|
||||||
|
- "{{ ssh_opts[5] }}"
|
||||||
|
- "{{ ssh_opts[6] }}"
|
||||||
|
- "{{ ssh_opts[7] }}"
|
||||||
|
- "{{ ssh_opts[8] }}"
|
||||||
|
- "{{ ssh_opts[9] }}"
|
||||||
|
- "{{ ssh_opts[10] }}"
|
||||||
|
- "{{ ssh_opts[11] }}"
|
||||||
|
- "{{ ssh_opts[12] }}"
|
||||||
|
- "{{ ssh_opts[13] }}"  # last entry: ssh_opts holds 14 elements (indices 0-13); higher indices are undefined
|
||||||
- "{{ vm_user }}@{{ vm_ip }}"
|
- "{{ vm_user }}@{{ vm_ip }}"
|
||||||
- bash
|
- bash
|
||||||
- -lc
|
- -lc
|
||||||
- "{{ ('sudo ' if use_sudo else '') + 'cat /proc/mdstat' }}"
|
- "{{ ('sudo ' if use_sudo else '') + item }}"
|
||||||
environment:
|
environment:
|
||||||
SSHPASS: "{{ vm_pass }}"
|
SSHPASS: "{{ vm_pass }}"
|
||||||
register: mdstat_raw
|
loop: "{{ raid_commands }}"
|
||||||
|
loop_control:
|
||||||
|
index_var: idx
|
||||||
|
label: "cmd-{{ idx }}"
|
||||||
|
register: raid_cmds
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
failed_when: false # do not stop early; assert below decides
|
||||||
no_log: "{{ (DEBUG | int) == 0 }}"  # cast first: a templated var may arrive as the string "0"
|
no_log: "{{ (DEBUG | int) == 0 }}"  # cast first: a templated var may arrive as the string "0"
|
||||||
retries: "{{ RETRIES }}"
|
retries: "{{ raid_retries }}"
|
||||||
delay: 2
|
delay: "{{ raid_delay }}"
|
||||||
until: mdstat_raw.rc == 0
|
until: (raid_cmds.rc | default(255)) != 255  # failed_when: false makes "is succeeded" always true; retry only SSH transport failures (ssh exits 255), the script's own rc is asserted later
|
||||||
|
|
||||||
- name: Debug | Show raw /proc/mdstat # English comments
|
- name: Show outputs for each RAID command
|
||||||
ansible.builtin.debug:
|
ansible.builtin.debug:
|
||||||
msg: "{{ mdstat_raw.stdout }}"
|
msg: |
|
||||||
|
CMD: {{ item.item }}
|
||||||
|
RC: {{ item.rc }}
|
||||||
|
STDOUT:
|
||||||
|
{{ (item.stdout | default('')).strip() }}
|
||||||
|
STDERR:
|
||||||
|
{{ (item.stderr | default('')).strip() }}
|
||||||
|
loop: "{{ raid_cmds.results }}"
|
||||||
when: (DEBUG | int) == 1  # cast first: a templated var may arrive as the string "1"
|
when: (DEBUG | int) == 1  # cast first: a templated var may arrive as the string "1"
|
||||||
|
|
||||||
- name: Extract RAID status token for selected MD device # English comments
|
- name: Fail play if RAID check failed # English comments
|
||||||
ansible.builtin.set_fact:
|
|
||||||
raid_token: >-
|
|
||||||
{{
|
|
||||||
(mdstat_raw.stdout | regex_search(
|
|
||||||
raid_md_device ~ '\\s*:\\s*active.*\\n\\s*\\d+\\s+blocks.*\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]',
|
|
||||||
'\\1',
|
|
||||||
multiline=True
|
|
||||||
)) | default('')
|
|
||||||
}}
|
|
||||||
|
|
||||||
- name: Detect sync operations in mdstat (resync/recovery/reshape/check/repair) # English comments
|
|
||||||
ansible.builtin.set_fact:
|
|
||||||
raid_syncing: "{{ (mdstat_raw.stdout is search('resync|recovery|reshape|check|repair')) | bool }}"
|
|
||||||
|
|
||||||
- name: Compute degraded flag (underscore means missing member) # English comments
|
|
||||||
ansible.builtin.set_fact:
|
|
||||||
raid_degraded: "{{ (raid_token | length > 0) and ('_' in raid_token) }}"
|
|
||||||
|
|
||||||
- name: Fail if MD device not found in /proc/mdstat # English comments
|
|
||||||
ansible.builtin.assert:
|
ansible.builtin.assert:
|
||||||
that:
|
that: "item.rc == 0"
|
||||||
- raid_token | length > 0
|
fail_msg: "RAID check failed on VM: {{ item.stdout | default(item.stderr, true) | default('no output', true) }}"  # boolean-mode default: empty stdout falls back to stderr
|
||||||
fail_msg: "RAID device {{ raid_md_device }} was not found in /proc/mdstat on VM ({{ vm_ip }})."
|
success_msg: "RAID check OK."
|
||||||
success_msg: "RAID device {{ raid_md_device }} found in /proc/mdstat."
|
loop: "{{ raid_cmds.results }}"
|
||||||
changed_when: false
|
loop_control:
|
||||||
|
index_var: idx
|
||||||
- name: Fail if RAID is degraded (token contains '_') # English comments
|
label: "cmd-{{ idx }}"
|
||||||
ansible.builtin.assert:
|
|
||||||
that:
|
|
||||||
- not raid_degraded
|
|
||||||
fail_msg: "RAID {{ raid_md_device }} is DEGRADED: token=[{{ raid_token }}] (expected all 'U')."
|
|
||||||
success_msg: "RAID {{ raid_md_device }} is OK: token=[{{ raid_token }}]."
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Fail if RAID is syncing and syncing is not allowed # English comments
|
|
||||||
ansible.builtin.assert:
|
|
||||||
that:
|
|
||||||
- (raid_allow_sync | int) == 1 or (not raid_syncing)
|
|
||||||
fail_msg: "RAID {{ raid_md_device }} is running a sync operation (resync/recovery/reshape/check/repair) and RAID_ALLOW_SYNC=0."
|
|
||||||
success_msg: "No sync operation detected (or RAID_ALLOW_SYNC=1)."
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Print concise summary (debug) # English comments
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: >-
|
|
||||||
RAID={{ raid_md_device }},
|
|
||||||
token=[{{ raid_token }}],
|
|
||||||
degraded={{ raid_degraded }},
|
|
||||||
syncing={{ raid_syncing }},
|
|
||||||
allow_sync={{ raid_allow_sync }}
|
|
||||||
when: DEBUG == 1
|
|
||||||
|
|||||||
Reference in New Issue
Block a user