redo
This commit is contained in:
221
check_raid.yml
221
check_raid.yml
@@ -1,7 +1,7 @@
|
||||
# check_raid.yml
|
||||
|
||||
- name: Check mdadm RAID status on VM (via SSH from proxmox_nextcloud)
|
||||
hosts: proxmox_nextcloud
|
||||
- name: Check Linux MD RAID health on VM via Proxmox
|
||||
hosts: linux_servers
|
||||
gather_facts: false
|
||||
become: true
|
||||
become_user: root
|
||||
@@ -14,17 +14,19 @@
|
||||
vm_pass: "{{ lookup('env', 'VM_PASS') }}"
|
||||
use_sudo: false
|
||||
|
||||
# Debug / behavior toggles (controlled via Semaphore variables)
|
||||
# Debug / retries
|
||||
DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
|
||||
# The env lookup yields '' when RETRIES is unset; default(..., true) is needed so the fallback of 10 applies to empty values.
RETRIES: "{{ lookup('env', 'RETRIES') | default(10, true) | int }}"
|
||||
|
||||
# Which md device to check (md0, md1, ... or /dev/md0)
|
||||
RAID_DEVICE: "{{ lookup('env', 'RAID_DEVICE') | default('md0', true) }}"
|
||||
# RAID device to check (e.g. md0, md1...)
|
||||
raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
|
||||
|
||||
# Fail the job if rebuild/resync is in progress (recommended)
|
||||
# The env lookup yields '' when FAIL_ON_RESYNC is unset; default(..., true) makes the fallback of 1 apply to empty values.
FAIL_ON_RESYNC: "{{ lookup('env', 'FAIL_ON_RESYNC') | default(1, true) | int }}"
|
||||
# If 0 => fail when resync/recovery/reshape/check/repair is detected in /proc/mdstat
|
||||
# If 1 => allow sync operations (the job then fails only on a degraded array: [U_], [_U], etc.)
|
||||
raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
|
||||
|
||||
# Fail the job if a periodic "check" is running (usually set 0 to avoid noise)
|
||||
FAIL_ON_CHECK: "{{ lookup('env', 'FAIL_ON_CHECK') | default(0) | int }}"
|
||||
# SSH options
|
||||
ssh_connect_timeout: 15
|
||||
|
||||
tasks:
|
||||
- name: Ensure sshpass is installed (for password-based SSH) # English comments
|
||||
@@ -32,185 +34,84 @@
|
||||
name: sshpass
|
||||
state: present
|
||||
update_cache: true
|
||||
cache_valid_time: 86400
|
||||
|
||||
- name: Read /proc/mdstat from VM (via SSH) # English comments
|
||||
ansible.builtin.command:
|
||||
argv:
|
||||
- timeout
|
||||
- 25s
|
||||
- sshpass
|
||||
- -e
|
||||
- ssh
|
||||
- -o
|
||||
- StrictHostKeyChecking=no
|
||||
- -o
|
||||
- UserKnownHostsFile=/dev/null
|
||||
- -o
|
||||
- ConnectTimeout=10
|
||||
- -o
|
||||
- ConnectionAttempts=1
|
||||
- -o
|
||||
- NumberOfPasswordPrompts=1
|
||||
- -o
|
||||
- PubkeyAuthentication=no
|
||||
- -o
|
||||
- GSSAPIAuthentication=no
|
||||
- -o
|
||||
- PasswordAuthentication=yes
|
||||
- -o
|
||||
- ServerAliveInterval=5
|
||||
- -o
|
||||
- ServerAliveCountMax=2
|
||||
- "ConnectTimeout={{ ssh_connect_timeout }}"
|
||||
- "{{ vm_user }}@{{ vm_ip }}"
|
||||
- "cat /proc/mdstat"
|
||||
- bash
|
||||
- -lc
|
||||
- "{{ ('sudo ' if use_sudo else '') + 'cat /proc/mdstat' }}"
|
||||
environment:
|
||||
SSHPASS: "{{ vm_pass }}"
|
||||
register: mdstat_cmd
|
||||
register: mdstat_raw
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
no_log: "{{ DEBUG == 0 }}"
|
||||
retries: "{{ RETRIES }}"
|
||||
delay: 2
|
||||
until: mdstat_raw.rc == 0
|
||||
|
||||
- name: Fail if we cannot read /proc/mdstat (SSH/auth/network) # English comments
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- mdstat_cmd.rc == 0
|
||||
fail_msg: >-
|
||||
Cannot read /proc/mdstat from {{ vm_ip }} (rc={{ mdstat_cmd.rc }}).
|
||||
stderr={{ (mdstat_cmd.stderr | default('') | trim) }}
|
||||
success_msg: "Successfully read /proc/mdstat from {{ vm_ip }}."
|
||||
changed_when: false
|
||||
|
||||
- name: Build base variables # English comments
|
||||
ansible.builtin.set_fact:
|
||||
raid_md: "{{ RAID_DEVICE | regex_replace('^/dev/', '') }}"
|
||||
mdstat_text: "{{ mdstat_cmd.stdout | default('') }}"
|
||||
changed_when: false
|
||||
|
||||
- name: Extract selected md block from mdstat (no regex) # English comments
|
||||
ansible.builtin.set_fact:
|
||||
raid_block: >-
|
||||
{%- set ns = namespace(block='') -%}
|
||||
{%- for b in (mdstat_text.split('\n\n')) -%}
|
||||
{%- set bb = (b | trim) -%}
|
||||
{%- if bb.startswith(raid_md ~ ' :') or bb.startswith(raid_md ~ ':') -%}
|
||||
{%- set ns.block = bb -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{{ ns.block }}
|
||||
changed_when: false
|
||||
|
||||
- name: Parse RAID status from mdstat (no regex) # English comments
|
||||
ansible.builtin.set_fact:
|
||||
raid_present: "{{ (raid_block | trim | length) > 0 }}"
|
||||
|
||||
# Extract [UU] / [U_] token by scanning bracket tokens and keeping only U/_ chars
|
||||
raid_status: >-
|
||||
{%- set ns = namespace(st='') -%}
|
||||
{%- for line in (raid_block.splitlines() if (raid_block | length) > 0 else []) -%}
|
||||
{%- for tok in line.split() -%}
|
||||
{%- if tok.startswith('[') and tok.endswith(']') -%}
|
||||
{%- set inner = tok[1:-1] -%}
|
||||
{%- if (inner | length) > 0 and ((inner | list | difference(['U','_'])) | length == 0) -%}
|
||||
{%- set ns.st = inner -%}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endfor -%}
|
||||
{{ ns.st }}
|
||||
|
||||
raid_is_degraded: "{{ (raid_status | length > 0) and ('_' in raid_status) }}"
|
||||
|
||||
raid_block_lower: "{{ raid_block | lower }}"
|
||||
raid_is_rebuilding: >-
|
||||
{{
|
||||
('resync' in raid_block_lower) or
|
||||
('recovery' in raid_block_lower) or
|
||||
('reshape' in raid_block_lower) or
|
||||
('repair' in raid_block_lower)
|
||||
}}
|
||||
raid_is_checking: "{{ 'check' in raid_block_lower }}"
|
||||
|
||||
# First line that contains an action keyword
|
||||
raid_action_line: >-
|
||||
{%- set ns = namespace(line='') -%}
|
||||
{%- set keys = ['resync','recovery','reshape','repair','check'] -%}
|
||||
{%- for line in (raid_block.splitlines() if (raid_block | length) > 0 else []) -%}
|
||||
{%- set l = (line | lower) -%}
|
||||
{%- for k in keys -%}
|
||||
{%- if ns.line == '' and (k in l) -%}
|
||||
{%- set ns.line = (line | trim) -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endfor -%}
|
||||
{{ ns.line }}
|
||||
|
||||
# Parse progress from ".... = 12.3%" if present
|
||||
raid_progress: >-
|
||||
{%- set ns = namespace(p='') -%}
|
||||
{%- if (raid_action_line | length) > 0 and ('=' in raid_action_line) and ('%' in raid_action_line) -%}
|
||||
{%- set right = raid_action_line.split('=', 1)[1] -%}
|
||||
{%- set ns.p = (right.split('%', 1)[0] | trim) -%}
|
||||
{%- endif -%}
|
||||
{{ ns.p }}
|
||||
changed_when: false
|
||||
|
||||
|
||||
- name: Debug | Show mdstat and parsed values # English comments
|
||||
- name: Debug | Show raw /proc/mdstat # English comments
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
--- /proc/mdstat ---
|
||||
{{ mdstat_text }}
|
||||
--- md block ({{ raid_md }}) ---
|
||||
{{ raid_block }}
|
||||
--- Parsed ---
|
||||
raid_present={{ raid_present }}
|
||||
raid_status={{ raid_status }}
|
||||
raid_is_degraded={{ raid_is_degraded }}
|
||||
raid_is_rebuilding={{ raid_is_rebuilding }}
|
||||
raid_is_checking={{ raid_is_checking }}
|
||||
raid_progress={{ raid_progress }}
|
||||
raid_action_line={{ raid_action_line }}
|
||||
msg: "{{ mdstat_raw.stdout }}"
|
||||
when: DEBUG == 1
|
||||
|
||||
- name: Fail if RAID device is not present # English comments
|
||||
- name: Extract RAID status token for selected MD device # English comments
|
||||
ansible.builtin.set_fact:
|
||||
# Capture the member-state token (e.g. UU or U_) that follows the [n/m] counter of the selected md device.
# regex_search with a backreference argument returns a list of groups (or nothing when there is no match),
# so the result is unwrapped with `first` to keep raid_token a plain string ('' when the device is absent).
raid_token: >-
|
||||
{{
|
||||
(mdstat_raw.stdout | regex_search(
|
||||
raid_md_device ~ '\\s*:\\s*active.*\\n\\s*\\d+\\s+blocks.*\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]',
|
||||
'\\1',
|
||||
|
||||
multiline=True
|
||||
)) | default([''], true) | first
|
||||
}}
|
||||
|
||||
- name: Detect sync operations in mdstat (resync/recovery/reshape/check/repair) # English comments
|
||||
ansible.builtin.set_fact:
|
||||
# True when any of the sync keywords appears anywhere in the captured mdstat output.
# NOTE(review): the search runs over the whole of mdstat_raw.stdout, not only the raid_md_device entry,
# so a sync operation on a different md device also sets this flag — confirm this is intended.
raid_syncing: "{{ (mdstat_raw.stdout is search('resync|recovery|reshape|check|repair')) | bool }}"
|
||||
|
||||
- name: Compute degraded flag (underscore means missing member) # English comments
|
||||
ansible.builtin.set_fact:
|
||||
raid_degraded: "{{ (raid_token | length > 0) and ('_' in raid_token) }}"
|
||||
|
||||
- name: Fail if MD device not found in /proc/mdstat # English comments
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- raid_present
|
||||
fail_msg: "RAID {{ raid_md }} not found in /proc/mdstat on VM {{ vm_ip }}."
|
||||
success_msg: "RAID {{ raid_md }} found in /proc/mdstat."
|
||||
- raid_token | length > 0
|
||||
fail_msg: "RAID device {{ raid_md_device }} was not found in /proc/mdstat on VM ({{ vm_ip }})."
|
||||
success_msg: "RAID device {{ raid_md_device }} found in /proc/mdstat."
|
||||
changed_when: false
|
||||
|
||||
- name: Fail if RAID is degraded (missing member) # English comments
|
||||
- name: Fail if RAID is degraded (token contains '_') # English comments
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- not raid_is_degraded
|
||||
fail_msg: >-
|
||||
RAID {{ raid_md }} is DEGRADED: status={{ raid_status }}.
|
||||
mdstat excerpt={{ (raid_block | trim) }}
|
||||
success_msg: "RAID {{ raid_md }} is OK: status={{ raid_status }}."
|
||||
- not raid_degraded
|
||||
fail_msg: "RAID {{ raid_md_device }} is DEGRADED: token=[{{ raid_token }}] (expected all 'U')."
|
||||
success_msg: "RAID {{ raid_md_device }} is OK: token=[{{ raid_token }}]."
|
||||
changed_when: false
|
||||
|
||||
- name: Fail if RAID rebuild/resync is in progress (optional) # English comments
|
||||
- name: Fail if RAID is syncing and syncing is not allowed # English comments
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- not raid_is_rebuilding
|
||||
fail_msg: >-
|
||||
RAID {{ raid_md }} is rebuilding/resyncing.
|
||||
{{ 'progress=' ~ raid_progress ~ '%; ' if (raid_progress | length) > 0 else '' }}
|
||||
line={{ (raid_action_line | default('n/a', true)) }}
|
||||
success_msg: "RAID {{ raid_md }} is not rebuilding/resyncing."
|
||||
when: FAIL_ON_RESYNC | int == 1
|
||||
- (raid_allow_sync | int) == 1 or (not raid_syncing)
|
||||
fail_msg: "RAID {{ raid_md_device }} is running a sync operation (resync/recovery/reshape/check/repair) and RAID_ALLOW_SYNC=0."
|
||||
success_msg: "No sync operation detected (or RAID_ALLOW_SYNC=1)."
|
||||
changed_when: false
|
||||
|
||||
- name: Fail if RAID check is in progress (optional) # English comments
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- not raid_is_checking
|
||||
fail_msg: >-
|
||||
RAID {{ raid_md }} is running a periodic check.
|
||||
{{ 'progress=' ~ raid_progress ~ '%; ' if (raid_progress | length) > 0 else '' }}
|
||||
line={{ (raid_action_line | default('n/a', true)) }}
|
||||
success_msg: "RAID {{ raid_md }} is not running a periodic check."
|
||||
when: FAIL_ON_CHECK | int == 1
|
||||
changed_when: false
|
||||
- name: Print concise summary (debug) # English comments
|
||||
ansible.builtin.debug:
|
||||
msg: >-
|
||||
RAID={{ raid_md_device }},
|
||||
token=[{{ raid_token }}],
|
||||
degraded={{ raid_degraded }},
|
||||
syncing={{ raid_syncing }},
|
||||
allow_sync={{ raid_allow_sync }}
|
||||
when: DEBUG == 1
|
||||
|
||||
Reference in New Issue
Block a user