forked from jakub/ansible
redo
This commit is contained in:
172
check_raid.yml
172
check_raid.yml
@@ -8,14 +8,18 @@
|
|||||||
become_method: sudo
|
become_method: sudo
|
||||||
|
|
||||||
vars:
|
vars:
|
||||||
|
# VM connection (provided by Semaphore env vars)
|
||||||
vm_ip: "{{ lookup('env', 'VM_IP') }}"
|
vm_ip: "{{ lookup('env', 'VM_IP') }}"
|
||||||
vm_user: "{{ lookup('env', 'VM_USER') }}"
|
vm_user: "{{ lookup('env', 'VM_USER') }}"
|
||||||
vm_pass: "{{ lookup('env', 'VM_PASS') }}"
|
vm_pass: "{{ lookup('env', 'VM_PASS') }}"
|
||||||
use_sudo: false
|
use_sudo: false
|
||||||
|
|
||||||
|
# Debug mode
|
||||||
DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
|
DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
|
||||||
RETRIES: "{{ lookup('env', 'RETRIES') | default(25) | int }}"
|
RETRIES: "{{ lookup('env', 'RETRIES') | default(25) | int }}"
|
||||||
|
|
||||||
|
# RAID specifics
|
||||||
|
# RAID_MD can be: md0 / md1 / ... OR "auto" to check all arrays found in /proc/mdstat
|
||||||
raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
|
raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
|
||||||
raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
|
raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
|
||||||
raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}"
|
raid_allow_no_array: "{{ lookup('env', 'RAID_ALLOW_NO_ARRAY') | default(0, true) | int }}"
|
||||||
@@ -24,6 +28,7 @@
|
|||||||
raid_delay: 2
|
raid_delay: 2
|
||||||
ssh_hard_timeout: 30
|
ssh_hard_timeout: 30
|
||||||
|
|
||||||
|
# SSH options
|
||||||
ssh_opts:
|
ssh_opts:
|
||||||
- "-o" # English comments
|
- "-o" # English comments
|
||||||
- "StrictHostKeyChecking=no"
|
- "StrictHostKeyChecking=no"
|
||||||
@@ -44,64 +49,65 @@
|
|||||||
- "-o"
|
- "-o"
|
||||||
- "NumberOfPasswordPrompts=1"
|
- "NumberOfPasswordPrompts=1"
|
||||||
|
|
||||||
raid_commands:
|
raid_check_cmd: |
|
||||||
- |
|
python3 - <<'PY'
|
||||||
python3 - <<'PY'
|
# Print exactly one status line and exit with code:
|
||||||
# Parse /proc/mdstat and validate MD RAID state
|
# 0=OK, 1=FAIL (degraded/disallowed sync), 2=ERROR (unexpected/misconfig)
|
||||||
import re, sys
|
import re, sys
|
||||||
|
|
||||||
target = "{{ raid_md_device }}"
|
target = "{{ raid_md_device }}"
|
||||||
allow_sync = int("{{ raid_allow_sync }}")
|
allow_sync = int("{{ raid_allow_sync }}")
|
||||||
allow_no_array = int("{{ raid_allow_no_array }}")
|
allow_no_array = int("{{ raid_allow_no_array }}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
|
txt = open("/proc/mdstat", "r", encoding="utf-8", errors="ignore").read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"ERROR: cannot read /proc/mdstat: {e}")
|
print(f"ERROR RAID read_mdstat err={e}")
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
arrays = {}
|
arrays = {}
|
||||||
header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE)
|
header_re = re.compile(r"^(md\d+)\s*:\s*active.*$", re.MULTILINE)
|
||||||
token_re = re.compile(r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$", re.MULTILINE)
|
token_re = re.compile(r"^\s*\d+\s+blocks.*\[\d+/\d+\]\s*\[([U_]+)\]\s*$", re.MULTILINE)
|
||||||
|
|
||||||
for m in header_re.finditer(txt):
|
for m in header_re.finditer(txt):
|
||||||
name = m.group(1)
|
name = m.group(1)
|
||||||
chunk = txt[m.end():m.end() + 3000]
|
chunk = txt[m.end():m.end() + 3000]
|
||||||
tm = token_re.search(chunk)
|
tm = token_re.search(chunk)
|
||||||
if tm:
|
if tm:
|
||||||
arrays[name] = tm.group(1)
|
arrays[name] = tm.group(1)
|
||||||
|
|
||||||
if not arrays:
|
if not arrays:
|
||||||
print("NO_MD_ARRAYS: /proc/mdstat contains no active md arrays.")
|
if allow_no_array:
|
||||||
print(txt.strip())
|
print("OK RAID none=no-md-arrays")
|
||||||
sys.exit(0 if allow_no_array else 2)
|
sys.exit(0)
|
||||||
|
print("ERROR RAID none=no-md-arrays")
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
|
syncing = bool(re.search(r"\b(resync|recovery|reshape|check|repair)\b", txt))
|
||||||
|
|
||||||
if target == "auto":
|
if target == "auto":
|
||||||
to_check = sorted(arrays.keys())
|
to_check = sorted(arrays.keys())
|
||||||
else:
|
else:
|
||||||
if target not in arrays:
|
if target not in arrays:
|
||||||
print(f"ERROR: {target} not found in /proc/mdstat. Found={sorted(arrays.keys())}")
|
found = ",".join(sorted(arrays.keys()))
|
||||||
print(txt.strip())
|
print(f"ERROR RAID target_not_found target={target} found={found}")
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
to_check = [target]
|
to_check = [target]
|
||||||
|
|
||||||
any_degraded = False
|
tokens_str = " ".join([f"{name}=[{arrays[name]}]" for name in to_check])
|
||||||
for name in to_check:
|
degraded = any("_" in arrays[name] for name in to_check)
|
||||||
token = arrays[name]
|
|
||||||
degraded = "_" in token
|
|
||||||
any_degraded = any_degraded or degraded
|
|
||||||
print(f"RAID={name} token=[{token}] degraded={degraded} syncing={syncing} allow_sync={allow_sync}")
|
|
||||||
|
|
||||||
if any_degraded:
|
if degraded:
|
||||||
sys.exit(1)
|
print(f"FAIL RAID {tokens_str} syncing={int(syncing)}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
if syncing and not allow_sync:
|
if syncing and not allow_sync:
|
||||||
sys.exit(1)
|
print(f"FAIL RAID {tokens_str} syncing={int(syncing)} allow_sync={allow_sync}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
sys.exit(0)
|
print(f"OK RAID {tokens_str} syncing={int(syncing)}")
|
||||||
PY
|
sys.exit(0)
|
||||||
|
PY
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Ensure sshpass is installed (for password-based SSH) # English comments
|
- name: Ensure sshpass is installed (for password-based SSH) # English comments
|
||||||
@@ -109,8 +115,9 @@
|
|||||||
name: sshpass
|
name: sshpass
|
||||||
state: present
|
state: present
|
||||||
update_cache: yes
|
update_cache: yes
|
||||||
|
run_once: true
|
||||||
|
|
||||||
- name: Run RAID check commands on VM (via SSH) # use SSHPASS env, hide item label
|
- name: Run RAID check on VM (via SSH) # single command, no loop
|
||||||
ansible.builtin.command:
|
ansible.builtin.command:
|
||||||
argv: >-
|
argv: >-
|
||||||
{{
|
{{
|
||||||
@@ -119,43 +126,58 @@
|
|||||||
+ ssh_opts
|
+ ssh_opts
|
||||||
+ [ vm_user ~ '@' ~ vm_ip,
|
+ [ vm_user ~ '@' ~ vm_ip,
|
||||||
'bash', '-lc',
|
'bash', '-lc',
|
||||||
('sudo ' if use_sudo else '') + item
|
('sudo ' if use_sudo else '') + raid_check_cmd
|
||||||
]
|
]
|
||||||
}}
|
}}
|
||||||
environment:
|
environment:
|
||||||
SSHPASS: "{{ vm_pass }}"
|
SSHPASS: "{{ vm_pass }}"
|
||||||
loop: "{{ raid_commands }}"
|
register: raid_cmd
|
||||||
loop_control:
|
|
||||||
index_var: idx
|
|
||||||
label: "cmd-{{ idx }}"
|
|
||||||
register: raid_cmds
|
|
||||||
changed_when: false
|
changed_when: false
|
||||||
failed_when: false
|
failed_when: false # we decide via assert below
|
||||||
no_log: "{{ DEBUG == 0 }}"
|
|
||||||
retries: "{{ raid_retries }}"
|
retries: "{{ raid_retries }}"
|
||||||
delay: "{{ raid_delay }}"
|
delay: "{{ raid_delay }}"
|
||||||
until: raid_cmds.rc not in [124, 255]
|
until: raid_cmd.rc not in [124, 255]
|
||||||
run_once: true
|
run_once: true
|
||||||
|
|
||||||
- name: Show outputs for each RAID command
|
- name: Build one-line summary (always)
|
||||||
ansible.builtin.debug:
|
ansible.builtin.set_fact:
|
||||||
msg: |
|
raid_line: >-
|
||||||
RC: {{ item.rc }}
|
{{
|
||||||
STDOUT:
|
(raid_cmd.stdout | default('') | trim)
|
||||||
{{ (item.stdout | default('')).strip() }}
|
if ((raid_cmd.stdout | default('') | trim) | length) > 0
|
||||||
STDERR:
|
else ('ERROR RAID no-output rc=' ~ (raid_cmd.rc | string))
|
||||||
{{ (item.stderr | default('')).strip() }}
|
}}
|
||||||
loop: "{{ (raid_cmds.results if (raid_cmds.results is defined) else [raid_cmds]) }}"
|
changed_when: false
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
- name: RAID result (always one line)
|
||||||
|
ansible.builtin.assert:
|
||||||
|
that:
|
||||||
|
- raid_cmd.rc == 0
|
||||||
|
success_msg: "{{ raid_line }}"
|
||||||
|
fail_msg: "{{ raid_line }}"
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
# Optional verbose debug
|
||||||
|
- name: Debug | /proc/mdstat (VM)
|
||||||
|
ansible.builtin.command:
|
||||||
|
argv: >-
|
||||||
|
{{
|
||||||
|
['timeout', '-k', '5', (ssh_hard_timeout | string)]
|
||||||
|
+ ['sshpass', '-e', 'ssh']
|
||||||
|
+ ssh_opts
|
||||||
|
+ [ vm_user ~ '@' ~ vm_ip, 'bash', '-lc', "cat /proc/mdstat" ]
|
||||||
|
}}
|
||||||
|
environment:
|
||||||
|
SSHPASS: "{{ vm_pass }}"
|
||||||
|
register: mdstat_dbg
|
||||||
|
changed_when: false
|
||||||
|
failed_when: false
|
||||||
when: DEBUG == 1
|
when: DEBUG == 1
|
||||||
run_once: true
|
run_once: true
|
||||||
|
|
||||||
- name: Fail play if RAID check failed # English comments
|
- name: Debug | mdstat output
|
||||||
ansible.builtin.assert:
|
ansible.builtin.debug:
|
||||||
that: "item.rc == 0"
|
msg: "{{ mdstat_dbg.stdout | default('') }}"
|
||||||
fail_msg: "RAID check failed on VM: {{ (item.stdout | default(item.stderr) | default('no output')) | trim }}"
|
when: DEBUG == 1
|
||||||
success_msg: "RAID check OK."
|
|
||||||
loop: "{{ (raid_cmds.results if (raid_cmds.results is defined) else [raid_cmds]) }}"
|
|
||||||
loop_control:
|
|
||||||
index_var: idx
|
|
||||||
label: "cmd-{{ idx }}"
|
|
||||||
run_once: true
|
run_once: true
|
||||||
|
|||||||
Reference in New Issue
Block a user