commit 1c6fb9c9c3 (parent a7d50a8f36)
Author: martin.fencl
Date: 2025-12-23 22:57:47 +01:00

2 changed files with 185 additions and 227 deletions

check_raid.yml

@@ -1,7 +1,7 @@
 # check_raid.yml
-- name: Check mdadm RAID status on VM (via SSH from proxmox_nextcloud)
-  hosts: proxmox_nextcloud
+- name: Check Linux MD RAID health on VM via Proxmox
+  hosts: linux_servers
   gather_facts: false
   become: true
   become_user: root
@@ -14,17 +14,19 @@
     vm_pass: "{{ lookup('env', 'VM_PASS') }}"
     use_sudo: false
-    # Debug / behavior toggles (controlled via Semaphore variables)
+    # Debug / retries
     DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
+    RETRIES: "{{ lookup('env', 'RETRIES') | default(10) | int }}"
-    # Which md device to check (md0, md1, ... or /dev/md0)
-    RAID_DEVICE: "{{ lookup('env', 'RAID_DEVICE') | default('md0', true) }}"
-    # Fail the job if rebuild/resync is in progress (recommended)
-    FAIL_ON_RESYNC: "{{ lookup('env', 'FAIL_ON_RESYNC') | default(1) | int }}"
-    # Fail the job if a periodic "check" is running (usually set 0 to avoid noise)
-    FAIL_ON_CHECK: "{{ lookup('env', 'FAIL_ON_CHECK') | default(0) | int }}"
+    # RAID device to check (e.g. md0, md1...)
+    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
+    # If 0 => fail when resync/recovery/reshape/check/repair is detected in /proc/mdstat
+    # If 1 => allow sync operations (still fails only on degraded [U_], [_U], etc.)
+    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
+    # SSH options
+    ssh_connect_timeout: 15
   tasks:
     - name: Ensure sshpass is installed (for password-based SSH)  # English comments
@@ -32,185 +34,84 @@
         name: sshpass
         state: present
         update_cache: yes
-        cache_valid_time: 86400
     - name: Read /proc/mdstat from VM (via SSH)  # English comments
       ansible.builtin.command:
         argv:
-          - timeout
-          - 25s
           - sshpass
           - -e
           - ssh
           - -o
           - StrictHostKeyChecking=no
           - -o
-          - UserKnownHostsFile=/dev/null
-          - -o
-          - ConnectTimeout=10
-          - -o
-          - ConnectionAttempts=1
-          - -o
-          - NumberOfPasswordPrompts=1
-          - -o
-          - PubkeyAuthentication=no
-          - -o
-          - GSSAPIAuthentication=no
-          - -o
-          - PasswordAuthentication=yes
-          - -o
-          - ServerAliveInterval=5
-          - -o
-          - ServerAliveCountMax=2
+          - "ConnectTimeout={{ ssh_connect_timeout }}"
           - "{{ vm_user }}@{{ vm_ip }}"
-          - "cat /proc/mdstat"
+          - bash
+          - -lc
+          - "{{ ('sudo ' if use_sudo else '') + 'cat /proc/mdstat' }}"
       environment:
         SSHPASS: "{{ vm_pass }}"
-      register: mdstat_cmd
+      register: mdstat_raw
       changed_when: false
-      failed_when: false
       no_log: "{{ DEBUG == 0 }}"
+      retries: "{{ RETRIES }}"
+      delay: 2
+      until: mdstat_raw.rc == 0
-    - name: Fail if we cannot read /proc/mdstat (SSH/auth/network)  # English comments
-      ansible.builtin.assert:
-        that:
-          - mdstat_cmd.rc == 0
-        fail_msg: >-
-          Cannot read /proc/mdstat from {{ vm_ip }} (rc={{ mdstat_cmd.rc }}).
-          stderr={{ (mdstat_cmd.stderr | default('') | trim) }}
-        success_msg: "Successfully read /proc/mdstat from {{ vm_ip }}."
-      changed_when: false
-    - name: Build base variables  # English comments
-      ansible.builtin.set_fact:
-        raid_md: "{{ RAID_DEVICE | regex_replace('^/dev/', '') }}"
-        mdstat_text: "{{ mdstat_cmd.stdout | default('') }}"
-      changed_when: false
-    - name: Extract selected md block from mdstat (no regex)  # English comments
-      ansible.builtin.set_fact:
-        raid_block: >-
-          {%- set ns = namespace(block='') -%}
-          {%- for b in (mdstat_text.split('\n\n')) -%}
-          {%- set bb = (b | trim) -%}
-          {%- if bb.startswith(raid_md ~ ' :') or bb.startswith(raid_md ~ ':') -%}
-          {%- set ns.block = bb -%}
-          {%- endif -%}
-          {%- endfor -%}
-          {{ ns.block }}
-      changed_when: false
-    - name: Parse RAID status from mdstat (no regex)  # English comments
-      ansible.builtin.set_fact:
-        raid_present: "{{ (raid_block | trim | length) > 0 }}"
-        # Extract [UU] / [U_] token by scanning bracket tokens and keeping only U/_ chars
-        raid_status: >-
-          {%- set ns = namespace(st='') -%}
-          {%- for line in (raid_block.splitlines() if (raid_block | length) > 0 else []) -%}
-          {%- for tok in line.split() -%}
-          {%- if tok.startswith('[') and tok.endswith(']') -%}
-          {%- set inner = tok[1:-1] -%}
-          {%- if (inner | length) > 0 and ((inner | list | difference(['U','_'])) | length == 0) -%}
-          {%- set ns.st = inner -%}
-          {%- endif -%}
-          {%- endif -%}
-          {%- endfor -%}
-          {%- endfor -%}
-          {{ ns.st }}
-        raid_is_degraded: "{{ (raid_status | length > 0) and ('_' in raid_status) }}"
-        raid_block_lower: "{{ raid_block | lower }}"
-        raid_is_rebuilding: >-
-          {{
-            ('resync' in raid_block_lower) or
-            ('recovery' in raid_block_lower) or
-            ('reshape' in raid_block_lower) or
-            ('repair' in raid_block_lower)
-          }}
-        raid_is_checking: "{{ 'check' in raid_block_lower }}"
-        # First line that contains an action keyword
-        raid_action_line: >-
-          {%- set ns = namespace(line='') -%}
-          {%- set keys = ['resync','recovery','reshape','repair','check'] -%}
-          {%- for line in (raid_block.splitlines() if (raid_block | length) > 0 else []) -%}
-          {%- set l = (line | lower) -%}
-          {%- for k in keys -%}
-          {%- if ns.line == '' and (k in l) -%}
-          {%- set ns.line = (line | trim) -%}
-          {%- endif -%}
-          {%- endfor -%}
-          {%- endfor -%}
-          {{ ns.line }}
-        # Parse progress from ".... = 12.3%" if present
-        raid_progress: >-
-          {%- set ns = namespace(p='') -%}
-          {%- if (raid_action_line | length) > 0 and ('=' in raid_action_line) and ('%' in raid_action_line) -%}
-          {%- set right = raid_action_line.split('=', 1)[1] -%}
-          {%- set ns.p = (right.split('%', 1)[0] | trim) -%}
-          {%- endif -%}
-          {{ ns.p }}
-      changed_when: false
-    - name: Debug | Show mdstat and parsed values  # English comments
-      ansible.builtin.debug:
-        msg: |
-          --- /proc/mdstat ---
-          {{ mdstat_text }}
-          --- md block ({{ raid_md }}) ---
-          {{ raid_block }}
-          --- Parsed ---
-          raid_present={{ raid_present }}
-          raid_status={{ raid_status }}
-          raid_is_degraded={{ raid_is_degraded }}
-          raid_is_rebuilding={{ raid_is_rebuilding }}
-          raid_is_checking={{ raid_is_checking }}
-          raid_progress={{ raid_progress }}
-          raid_action_line={{ raid_action_line }}
-      when: DEBUG == 1
+    - name: Debug | Show raw /proc/mdstat  # English comments
+      ansible.builtin.debug:
+        msg: "{{ mdstat_raw.stdout }}"
+      when: DEBUG == 1
-    - name: Fail if RAID device is not present  # English comments
+    - name: Extract RAID status token for selected MD device  # English comments
+      ansible.builtin.set_fact:
+        raid_token: >-
+          {{
+            (mdstat_raw.stdout | regex_search(
+              raid_md_device ~ '\\s*:\\s*active.*\\n\\s*\\d+\\s+blocks.*\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]',
+              '\\1',
+              multiline=True
+            )) | default('')
+          }}
+    - name: Detect sync operations in mdstat (resync/recovery/reshape/check/repair)  # English comments
+      ansible.builtin.set_fact:
+        raid_syncing: "{{ (mdstat_raw.stdout is search('resync|recovery|reshape|check|repair')) | bool }}"
+    - name: Compute degraded flag (underscore means missing member)  # English comments
+      ansible.builtin.set_fact:
+        raid_degraded: "{{ (raid_token | length > 0) and ('_' in raid_token) }}"
+    - name: Fail if MD device not found in /proc/mdstat  # English comments
       ansible.builtin.assert:
         that:
-          - raid_present
-        fail_msg: "RAID {{ raid_md }} not found in /proc/mdstat on VM {{ vm_ip }}."
-        success_msg: "RAID {{ raid_md }} found in /proc/mdstat."
+          - raid_token | length > 0
+        fail_msg: "RAID device {{ raid_md_device }} was not found in /proc/mdstat on VM ({{ vm_ip }})."
+        success_msg: "RAID device {{ raid_md_device }} found in /proc/mdstat."
       changed_when: false
-    - name: Fail if RAID is degraded (missing member)  # English comments
+    - name: Fail if RAID is degraded (token contains '_')  # English comments
       ansible.builtin.assert:
         that:
-          - not raid_is_degraded
-        fail_msg: >-
-          RAID {{ raid_md }} is DEGRADED: status={{ raid_status }}.
-          mdstat excerpt={{ (raid_block | trim) }}
-        success_msg: "RAID {{ raid_md }} is OK: status={{ raid_status }}."
+          - not raid_degraded
+        fail_msg: "RAID {{ raid_md_device }} is DEGRADED: token=[{{ raid_token }}] (expected all 'U')."
+        success_msg: "RAID {{ raid_md_device }} is OK: token=[{{ raid_token }}]."
       changed_when: false
-    - name: Fail if RAID rebuild/resync is in progress (optional)  # English comments
+    - name: Fail if RAID is syncing and syncing is not allowed  # English comments
       ansible.builtin.assert:
         that:
-          - not raid_is_rebuilding
-        fail_msg: >-
-          RAID {{ raid_md }} is rebuilding/resyncing.
-          {{ 'progress=' ~ raid_progress ~ '%; ' if (raid_progress | length) > 0 else '' }}
-          line={{ (raid_action_line | default('n/a', true)) }}
-        success_msg: "RAID {{ raid_md }} is not rebuilding/resyncing."
-      when: FAIL_ON_RESYNC | int == 1
+          - (raid_allow_sync | int) == 1 or (not raid_syncing)
+        fail_msg: "RAID {{ raid_md_device }} is running a sync operation (resync/recovery/reshape/check/repair) and RAID_ALLOW_SYNC=0."
+        success_msg: "No sync operation detected (or RAID_ALLOW_SYNC=1)."
       changed_when: false
-    - name: Fail if RAID check is in progress (optional)  # English comments
-      ansible.builtin.assert:
-        that:
-          - not raid_is_checking
-        fail_msg: >-
-          RAID {{ raid_md }} is running a periodic check.
-          {{ 'progress=' ~ raid_progress ~ '%; ' if (raid_progress | length) > 0 else '' }}
-          line={{ (raid_action_line | default('n/a', true)) }}
-        success_msg: "RAID {{ raid_md }} is not running a periodic check."
-      when: FAIL_ON_CHECK | int == 1
-      changed_when: false
+    - name: Print concise summary (debug)  # English comments
+      ansible.builtin.debug:
+        msg: >-
+          RAID={{ raid_md_device }},
+          token=[{{ raid_token }}],
+          degraded={{ raid_degraded }},
+          syncing={{ raid_syncing }},
+          allow_sync={{ raid_allow_sync }}
+      when: DEBUG == 1
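A quick way to exercise the new check by hand (a minimal sketch: only the environment variable names are taken from the play; the host values, password and inventory path below are placeholders):

    # Hypothetical values for VM_IP/VM_USER/VM_PASS and the inventory path;
    # RAID_MD, RAID_ALLOW_SYNC, DEBUG and RETRIES map 1:1 to the lookups in the play.
    VM_IP=192.0.2.10 VM_USER=admin VM_PASS='...' \
    RAID_MD=md0 RAID_ALLOW_SYNC=0 DEBUG=1 RETRIES=5 \
    ansible-playbook -i inventory check_raid.yml

On a healthy two-member array the extracted token is UU; a missing member shows up as U_ or _U, which trips the degraded assert.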

nextcloud/update_collabora.yml

@@ -1,6 +1,6 @@
-# check_raid.yml
+# nextcloud/update_collabora.yml
-- name: Check Linux MD RAID health on VM via Proxmox
+- name: Update Collabora CODE on VM via Proxmox
   hosts: linux_servers
   gather_facts: false
   become: true
@@ -8,25 +8,33 @@
   become_method: sudo
   vars:
-    # VM connection (provided by Semaphore env vars)
+    # --- Connection to VM (provided by Semaphore env vars) ---
     vm_ip: "{{ lookup('env', 'VM_IP') }}"
     vm_user: "{{ lookup('env', 'VM_USER') }}"
     vm_pass: "{{ lookup('env', 'VM_PASS') }}"
     use_sudo: false
-    # Debug / retries
+    # --- Debug mode (controlled via Semaphore variable) ---
     DEBUG: "{{ lookup('env', 'DEBUG') | default(0) | int }}"
-    RETRIES: "{{ lookup('env', 'RETRIES') | default(10) | int }}"
+    RETRIES: "{{ lookup('env', 'RETRIES') | default(25) | int }}"
-    # RAID device to check (e.g. md0, md1...)
-    raid_md_device: "{{ lookup('env', 'RAID_MD') | default('md0', true) }}"
-    # If 0 => fail when resync/recovery/reshape/check/repair is detected in /proc/mdstat
-    # If 1 => allow sync operations (still fails only on degraded [U_], [_U], etc.)
-    raid_allow_sync: "{{ lookup('env', 'RAID_ALLOW_SYNC') | default(1, true) | int }}"
-    # SSH options
-    ssh_connect_timeout: 15
+    # --- Collabora specifics ---
+    collabora_debug_caps: true
+    collabora_caps_url: "https://collabora.martinfencl.eu/hosting/capabilities"
+    # Use the FULL Nextcloud stack compose file; only target the 'collabora' service inside it
+    collabora_project: "nextcloud-collabora"
+    collabora_compose_file: "/data/compose/nextcloud/nextcloud-collabora.yml"
+    collabora_service: "collabora"
+    # Docker command prefix (consistent behavior and quiet hints)
+    docker_prefix: "unalias docker 2>/dev/null || true; DOCKER_CLI_HINTS=0; command docker"
+    # Commands to run on the target VM (quiet outputs)
+    collabora_commands:
+      - "{{ docker_prefix }} pull -q collabora/code:latest >/dev/null"
+      - "{{ docker_prefix }} compose -p {{ collabora_project }} -f {{ collabora_compose_file }} pull {{ collabora_service }} >/dev/null"
+      - "{{ docker_prefix }} compose -p {{ collabora_project }} -f {{ collabora_compose_file }} up -d --no-deps --force-recreate {{ collabora_service }} >/dev/null"
   tasks:
     - name: Ensure sshpass is installed (for password-based SSH)  # English comments
@@ -35,7 +43,7 @@
         state: present
         update_cache: yes
-    - name: Read /proc/mdstat from VM (via SSH)  # English comments
+    - name: Run Collabora update commands on VM (via SSH)  # use SSHPASS env, hide item value
       ansible.builtin.command:
         argv:
           - sshpass
@@ -44,74 +52,123 @@
           - -o
           - StrictHostKeyChecking=no
           - -o
-          - "ConnectTimeout={{ ssh_connect_timeout }}"
+          - ConnectTimeout=15
           - "{{ vm_user }}@{{ vm_ip }}"
           - bash
           - -lc
-          - "{{ ('sudo ' if use_sudo else '') + 'cat /proc/mdstat' }}"
+          - "{{ ('sudo ' if use_sudo else '') + item }}"
       environment:
         SSHPASS: "{{ vm_pass }}"
-      register: mdstat_raw
+      loop: "{{ collabora_commands }}"
+      loop_control:
+        index_var: idx  # <-- capture loop index here
+        label: "cmd-{{ idx }}"  # <-- use idx instead of loop.index
+      register: collab_cmds
       changed_when: false
       no_log: "{{ DEBUG == 0 }}"
+    - name: Show outputs for each Collabora command
+      ansible.builtin.debug:
+        msg: |
+          CMD: {{ item.item }}
+          RC: {{ item.rc }}
+          STDOUT:
+          {{ (item.stdout | default('')).strip() }}
+          STDERR:
+          {{ (item.stderr | default('')).strip() }}
+      loop: "{{ collab_cmds.results }}"
+      when: DEBUG == 1
+    - name: Fail play if any Collabora command failed  # also hide item label
+      ansible.builtin.assert:
+        that: "item.rc == 0"
+        fail_msg: "Collabora update failed on VM: {{ item.item }} (rc={{ item.rc }})"
+        success_msg: "All Collabora update commands succeeded."
+      loop: "{{ collab_cmds.results }}"
+      loop_control:
+        index_var: idx
+        label: "cmd-{{ idx }}"
+    # -------------------------
+    # Readiness checks (controller first, then VM fallback)
+    # -------------------------
+    - name: Collabora | Wait for capabilities (controller first)
+      ansible.builtin.uri:
+        url: "{{ collabora_caps_url }}"
+        method: GET
+        return_content: true
+        validate_certs: true
+        status_code: 200
+      register: caps_controller
+      delegate_to: localhost
+      run_once: true
       retries: "{{ RETRIES }}"
       delay: 2
-      until: mdstat_raw.rc == 0
+      until: caps_controller.status == 200
+      failed_when: false
+      changed_when: false
-    - name: Debug | Show raw /proc/mdstat  # English comments
-      ansible.builtin.debug:
-        msg: "{{ mdstat_raw.stdout }}"
-      when: DEBUG == 1
+    - name: Collabora | VM-side fetch (pure JSON via Python)  # use SSHPASS env here too
+      ansible.builtin.command:
+        argv:
+          - sshpass
+          - -e
+          - ssh
+          - -o
+          - StrictHostKeyChecking=no
+          - -o
+          - ConnectTimeout=15
+          - "{{ vm_user }}@{{ vm_ip }}"
+          - bash
+          - -lc
+          - |
+            python3 - <<'PY'
+            import json, urllib.request, sys
+            try:
+                with urllib.request.urlopen("{{ collabora_caps_url }}", timeout=15) as r:
+                    sys.stdout.write(r.read().decode())
+            except Exception:
+                pass
+            PY
+      environment:
+        SSHPASS: "{{ vm_pass }}"
+      register: caps_vm
+      changed_when: false
+      failed_when: false
+      when: caps_controller.status | default(0) != 200 or caps_controller.json is not defined
+      no_log: "{{ DEBUG == 0 }}"
-    - name: Extract RAID status token for selected MD device  # English comments
+    - name: Collabora | Choose JSON (controller wins, else VM)
       ansible.builtin.set_fact:
-        raid_token: >-
+        collab_caps_json: >-
           {{
-            (mdstat_raw.stdout | regex_search(
-              raid_md_device ~ '\\s*:\\s*active.*\\n\\s*\\d+\\s+blocks.*\\[[0-9]+/[0-9]+\\]\\s*\\[([U_]+)\\]',
-              '\\1',
-              multiline=True
-            )) | default('')
+            (caps_controller.json
+             if (caps_controller.status|default(0))==200 and (caps_controller.json is defined)
+             else (
+               (caps_vm.stdout | default('') | trim | length > 0)
+               | ternary((caps_vm.stdout | trim | from_json), omit)
+             )
+            )
           }}
+      failed_when: false
-    - name: Detect sync operations in mdstat (resync/recovery/reshape/check/repair)  # English comments
-      ansible.builtin.set_fact:
-        raid_syncing: "{{ (mdstat_raw.stdout is search('resync|recovery|reshape|check|repair')) | bool }}"
-    - name: Compute degraded flag (underscore means missing member)  # English comments
-      ansible.builtin.set_fact:
-        raid_degraded: "{{ (raid_token | length > 0) and ('_' in raid_token) }}"
-    - name: Fail if MD device not found in /proc/mdstat  # English comments
-      ansible.builtin.assert:
-        that:
-          - raid_token | length > 0
-        fail_msg: "RAID device {{ raid_md_device }} was not found in /proc/mdstat on VM ({{ vm_ip }})."
-        success_msg: "RAID device {{ raid_md_device }} found in /proc/mdstat."
-      changed_when: false
-    - name: Fail if RAID is degraded (token contains '_')  # English comments
-      ansible.builtin.assert:
-        that:
-          - not raid_degraded
-        fail_msg: "RAID {{ raid_md_device }} is DEGRADED: token=[{{ raid_token }}] (expected all 'U')."
-        success_msg: "RAID {{ raid_md_device }} is OK: token=[{{ raid_token }}]."
-      changed_when: false
-    - name: Fail if RAID is syncing and syncing is not allowed  # English comments
-      ansible.builtin.assert:
-        that:
-          - (raid_allow_sync | int) == 1 or (not raid_syncing)
-        fail_msg: "RAID {{ raid_md_device }} is running a sync operation (resync/recovery/reshape/check/repair) and RAID_ALLOW_SYNC=0."
-        success_msg: "No sync operation detected (or RAID_ALLOW_SYNC=1)."
-      changed_when: false
-    - name: Print concise summary (debug)  # English comments
+    - name: Collabora | Print concise summary
       ansible.builtin.debug:
         msg: >-
-          RAID={{ raid_md_device }},
-          token=[{{ raid_token }}],
-          degraded={{ raid_degraded }},
-          syncing={{ raid_syncing }},
-          allow_sync={{ raid_allow_sync }}
-      when: DEBUG == 1
+          Collabora {{ collab_caps_json.productVersion | default('?') }}
+          ({{ collab_caps_json.productName | default('?') }}),
+          convert-to.available={{ collab_caps_json['convert-to']['available'] | default('n/a') }},
+          serverId={{ collab_caps_json.serverId | default('n/a') }}
+      when: collab_caps_json is defined and DEBUG == 1
+    - name: Collabora | Capabilities unavailable (after retries)
+      ansible.builtin.debug:
+        msg: "Capabilities endpoint is not available even after all retries."
+      when: collab_caps_json is not defined and DEBUG == 1
+    # Optional full JSON (debug)
+    - name: Collabora | Full JSON (debug)
+      ansible.builtin.debug:
+        var: collab_caps_json
+      when: collabora_debug_caps and (collab_caps_json is defined) and DEBUG == 1
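To sanity-check the readiness logic outside Ansible, the same capabilities document the play polls can be fetched directly (illustrative only; the URL is the collabora_caps_url defined above):

    # Fetch and pretty-print the capabilities JSON; productVersion and serverId
    # are among the fields echoed by the play's summary task.
    curl -fsS https://collabora.martinfencl.eu/hosting/capabilities | python3 -m json.tool

The play itself is driven the same way as check_raid.yml: VM_IP, VM_USER, VM_PASS, DEBUG and RETRIES are supplied as Semaphore environment variables.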