From ae0ccf4b7d18608a8a0f98d9a608c3a7e866b8d0 Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Fri, 14 Feb 2025 13:38:20 +0100 Subject: [PATCH 01/10] Allow upgrade to continue with yellow cluster health under certain conditions --- .../tasks/elasticsearch-rolling-stop.yml | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml index fe7a0948..ac6bb95f 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml @@ -33,8 +33,14 @@ retries: 5 delay: 30 - # this step is key!!! Don't restart more nodes - # until all shards have completed recovery + # this step is key!!! Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards + # + # From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html ## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version. + ## + ## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow. + ## + ## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns). 
- name: Wait for cluster health to return to green ansible.builtin.uri: url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" @@ -43,10 +49,33 @@ password: "{{ elasticstack_password.stdout }}" validate_certs: no register: response - until: "response.json.status == 'green'" + until: "response.json.status == 'green' or + ( response.json.status == 'yellow' and + response.json.relocating_shards == 0 and + response.json.initializing_shards == 0 + )" retries: 50 delay: 30 +# Extra safety in case we continune with a yellow cluster +# Wait a short time, then check cluster status again +- name: "Attempting to contune with yellow cluster health" + when: "response.json.status == 'yellow'" + block: + - name: "Cluster health yellow: Wait before verifying status" + ansible.builtin.pause: + seconds: 10 + + - name: "Cluster health yellow: Verify we can safely continue" + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" + method: GET + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response1 + failed_when: "response1.json.relocating_shards != 0 or response1.json.initializing_shards != 0" + # Disabling shard allocation right after enabling it seems redundant. Please see above for details. 
- name: Disable shard allocation for the cluster ansible.builtin.uri: From 30e182c97f2a1bab6677bb2e185d4e4b2bb0aeed Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Fri, 14 Feb 2025 13:40:05 +0100 Subject: [PATCH 02/10] Add test for upgrade --- molecule/elasticsearch_upgrade/converge.yml | 32 +++++++++++++++++++ molecule/elasticsearch_upgrade/molecule.yml | 32 +++++++++++++++++++ molecule/elasticsearch_upgrade/prepare.yml | 15 +++++++++ .../elasticsearch_upgrade/requirements.yml | 3 ++ 4 files changed, 82 insertions(+) create mode 100644 molecule/elasticsearch_upgrade/converge.yml create mode 100644 molecule/elasticsearch_upgrade/molecule.yml create mode 100644 molecule/elasticsearch_upgrade/prepare.yml create mode 100644 molecule/elasticsearch_upgrade/requirements.yml diff --git a/molecule/elasticsearch_upgrade/converge.yml b/molecule/elasticsearch_upgrade/converge.yml new file mode 100644 index 00000000..0d699052 --- /dev/null +++ b/molecule/elasticsearch_upgrade/converge.yml @@ -0,0 +1,32 @@ +--- +# The workaround for arbitrarily named role directory is important because the git repo has one name and the role within it another +# Found at: https://github.com/ansible-community/molecule/issues/1567#issuecomment-436876722 +- name: Converge + collections: + - netways.elasticstack + hosts: all + vars: + #elasticsearch_security: true # needed for tests of > 7 releases + elasticstack_full_stack: false + elasticsearch_jna_workaround: true + elasticsearch_disable_systemcallfilterchecks: true + elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | int}}" + elasticsearch_heap: "1" + elasticstack_no_log: false + tasks: + - name: Include Elastics repos role + ansible.builtin.include_role: + name: repos + - name: Include Elasticsearch + ansible.builtin.include_role: + name: elasticsearch + vars: + elasticstack_version: "8.16.0" + tags: + - molecule-idempotence-notest + + - name: Include Elasticsearch + ansible.builtin.include_role: + name: elasticsearch + vars: + 
elasticstack_version: "8.17.0" diff --git a/molecule/elasticsearch_upgrade/molecule.yml b/molecule/elasticsearch_upgrade/molecule.yml new file mode 100644 index 00000000..7c2c71b7 --- /dev/null +++ b/molecule/elasticsearch_upgrade/molecule.yml @@ -0,0 +1,32 @@ +--- +dependency: + name: galaxy + options: + requirements-file: requirements.yml +driver: + name: docker +platforms: + - name: elasticsearch_default1 + groups: + - elasticsearch + image: "geerlingguy/docker-${MOLECULE_DISTRO:-debian11}-ansible:latest" + command: ${MOLECULE_DOCKER_COMMAND:-""} + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:rw + cgroupns_mode: host + privileged: true + pre_build_image: true + - name: elasticsearch_default2 + groups: + - elasticsearch + image: "geerlingguy/docker-${MOLECULE_DISTRO:-debian11}-ansible:latest" + command: ${MOLECULE_DOCKER_COMMAND:-""} + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:rw + cgroupns_mode: host + privileged: true + pre_build_image: true +provisioner: + name: ansible +verifier: + name: ansible diff --git a/molecule/elasticsearch_upgrade/prepare.yml b/molecule/elasticsearch_upgrade/prepare.yml new file mode 100644 index 00000000..cf162407 --- /dev/null +++ b/molecule/elasticsearch_upgrade/prepare.yml @@ -0,0 +1,15 @@ +--- +- name: Prepare + hosts: all + tasks: + - name: Install packages for Debian + ansible.builtin.apt: + name: + - gpg + - gpg-agent + - procps + - curl + - iproute2 + - git + - openssl + update_cache: yes diff --git a/molecule/elasticsearch_upgrade/requirements.yml b/molecule/elasticsearch_upgrade/requirements.yml new file mode 100644 index 00000000..8dd51618 --- /dev/null +++ b/molecule/elasticsearch_upgrade/requirements.yml @@ -0,0 +1,3 @@ +--- +collections: + - community.general From d83d38f60647c2e0e9488cb32eae4a89ef520aa1 Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Tue, 11 Mar 2025 15:05:23 +0100 Subject: [PATCH 03/10] Avoid risky pipes. 
Increase retries to handle clusters with single frozen or cold datanode --- .../tasks/elasticsearch-rolling-start.yml | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml index 8223cb74..efd9a781 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml @@ -36,20 +36,15 @@ port: "{{ elasticstack_elasticsearch_http_port }}" delay: 30 -- name: Confirm the node joins the cluster # noqa: risky-shell-pipe - ansible.builtin.shell: > - if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi; - curl - -k - -u elastic:{{ elasticstack_password.stdout }} - -s - -m 2 - '{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name' - | grep - -E - '^{{ elasticsearch_nodename }}$' - register: result - until: result.rc == 0 +- name: Confirm the node joins the cluster + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}//_cat/nodes?h=name&format=json" + method: GET + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response + until: response.json | json_query("[?name=='{{ elasticsearch_nodename }}']") | length > 0 retries: 200 delay: 3 changed_when: false @@ -79,5 +74,5 @@ validate_certs: no register: response until: "response.json.status == 'yellow' or response.json.status == 'green'" - retries: 5 + retries: 200 delay: 30 From 01f48de8113969cddcf8f9f056802931e89540c8 Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Wed, 12 Mar 2025 13:35:07 +0100 Subject: [PATCH 04/10] Flush handlers before upgrade --- molecule/elasticsearch_upgrade/converge.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/molecule/elasticsearch_upgrade/converge.yml b/molecule/elasticsearch_upgrade/converge.yml index 0d699052..68a1f8fa 100644 --- a/molecule/elasticsearch_upgrade/converge.yml +++ b/molecule/elasticsearch_upgrade/converge.yml @@ -20,13 +20,16 @@ - name: Include Elasticsearch ansible.builtin.include_role: name: elasticsearch - vars: + vars: elasticstack_version: "8.16.0" tags: - molecule-idempotence-notest + - name: Flush handlers before upgrade + meta: flush_handlers + - name: Include Elasticsearch ansible.builtin.include_role: name: elasticsearch - vars: + vars: elasticstack_version: "8.17.0" From a4edc7ff307755f448b04ec085a183ee43d293c5 Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Wed, 12 Mar 2025 13:36:56 +0100 Subject: [PATCH 05/10] Avoid unsafe conditional error --- roles/elasticsearch/tasks/elasticsearch-rolling-start.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml index efd9a781..adefee1e 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml @@ -38,16 +38,19 @@ - name: Confirm the node joins the cluster ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}//_cat/nodes?h=name&format=json" + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name&format=json" method: GET user: elastic password: "{{ elasticstack_password.stdout }}" validate_certs: no register: response - until: response.json | json_query("[?name=='{{ elasticsearch_nodename }}']") | length > 0 + until: node_found | bool retries: 200 delay: 3 changed_when: false + vars: + node_found: "{{ response.json | json_query(node_query) | length > 0 }}" + node_query: "[?name=='{{ elasticsearch_nodename }}']" - name: Enable 
shard allocation for the cluster ansible.builtin.uri: From 9a4be0deedde5a7d57b409b7a70b81cd5f61432e Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Wed, 12 Mar 2025 13:38:07 +0100 Subject: [PATCH 06/10] Only check for yellow cluster condition after we time out --- .../tasks/elasticsearch-rolling-stop.yml | 85 +++++++++++-------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml index ac6bb95f..b3733e4d 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml @@ -33,48 +33,65 @@ retries: 5 delay: 30 - # this step is key!!! Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards - # - # From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html - ## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version. - ## - ## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow. - ## - ## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns). 
-- name: Wait for cluster health to return to green - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" - method: GET - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - until: "response.json.status == 'green' or - ( response.json.status == 'yellow' and - response.json.relocating_shards == 0 and - response.json.initializing_shards == 0 - )" - retries: 50 - delay: 30 -# Extra safety in case we continune with a yellow cluster -# Wait a short time, then check cluster status again -- name: "Attempting to contune with yellow cluster health" - when: "response.json.status == 'yellow'" - block: - - name: "Cluster health yellow: Wait before verifying status" - ansible.builtin.pause: - seconds: 10 +# +# Start cluster health check +# - - name: "Cluster health yellow: Verify we can safely continue" +# this step is key!!! Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards +# +# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html +## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version. +## +## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow. +## +## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns). 
+ +- name: Check cluster health + block: + - name: Wait for cluster health to return to green ansible.builtin.uri: url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" method: GET user: elastic password: "{{ elasticstack_password.stdout }}" validate_certs: no - register: response1 - failed_when: "response1.json.relocating_shards != 0 or response1.json.initializing_shards != 0" + register: response + until: "response.json.status == 'green'" + retries: 50 + delay: 30 + + # Timed out while waiting for green cluster + # Check if we can continue with a yellow cluster + rescue: + - name: "Rescue: Check if cluster health is yellow" + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" + method: GET + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response + failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" + + - name: "Rescure: Wait before verifying status" + ansible.builtin.pause: + seconds: 10 + + - name: "Rescue: Verify we can safely continue with yellow cluster" + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" + method: GET + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response + failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" + +# +# End cluster health check +# + # Disabling shard allocation right after enabling it seems redundant. Please see above for details. 
- name: Disable shard allocation for the cluster From 5adc31bdc97a59ceeef21ea12f527da809054dfa Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Mon, 17 Mar 2025 10:06:10 +0100 Subject: [PATCH 07/10] Avoid repeating tasks in multiple files. Make sure cluster health is OK before play continues from last node --- .../tasks/elasticsearch-rolling-start.yml | 30 +------ .../tasks/elasticsearch-rolling-stop.yml | 80 +------------------ .../elasticsearch-wait-for-cluster-health.yml | 66 +++++++++++++++ 3 files changed, 73 insertions(+), 103 deletions(-) create mode 100644 roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml index adefee1e..627df8da 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml @@ -52,30 +52,6 @@ node_found: "{{ response.json | json_query(node_query) | length > 0 }}" node_query: "[?name=='{{ elasticsearch_nodename }}']" -- name: Enable shard allocation for the cluster - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings" - method: PUT - body: '{ "persistent": { "cluster.routing.allocation.enable": null }}' - body_format: json - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - # next line is boolean not string, so no quotes around true - # use python truthiness - until: "response.json.acknowledged == true" - retries: 5 - delay: 30 - -- name: Wait for cluster health to return to yellow or green - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" - method: GET - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - 
until: "response.json.status == 'yellow' or response.json.status == 'green'" - retries: 200 - delay: 30 +# Don't continue the play unless cluster health is OK +- name: Cluster health check + ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml index b3733e4d..f98d0524 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml @@ -15,82 +15,10 @@ ansible.builtin.set_fact: elasticsearch_http_protocol: "https" -# Usually we should not need this step. It's only there to recover from broken upgrade plays -# Without this step the cluster would never recover and the play would always fail -- name: Enable shard allocation for the cluster - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings" - method: PUT - body: '{ "persistent": { "cluster.routing.allocation.enable": null }}' - body_format: json - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - # next line is boolean not string, so no quotes around true - # use python truthiness - until: "response.json.acknowledged == true" - retries: 5 - delay: 30 - - -# -# Start cluster health check -# - -# this step is key!!! Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards -# -# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html -## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version. 
-## -## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow. -## -## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns). - -- name: Check cluster health - block: - - name: Wait for cluster health to return to green - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" - method: GET - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - until: "response.json.status == 'green'" - retries: 50 - delay: 30 - - # Timed out while waiting for green cluster - # Check if we can continue with a yellow cluster - rescue: - - name: "Rescue: Check if cluster health is yellow" - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" - method: GET - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" - - - name: "Rescure: Wait before verifying status" - ansible.builtin.pause: - seconds: 10 - - - name: "Rescue: Verify we can safely continue with yellow cluster" - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" - method: GET - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" - -# -# End cluster health check -# +# This step is here primarily in order to recover from 
broken/restarted upgrade or rolling restart. +# TODO: Only run this task for the first host. +- name: Cluster health check + ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml # Disabling shard allocation right after enabling it seems redundant. Please see above for details. diff --git a/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml b/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml new file mode 100644 index 00000000..ebb7cf6c --- /dev/null +++ b/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml @@ -0,0 +1,66 @@ +--- + +# Make sure shard allocation is enabled +- name: Enable shard allocation for the cluster + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings" + method: PUT + body: '{ "persistent": { "cluster.routing.allocation.enable": null }}' + body_format: json + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response + # next line is boolean not string, so no quotes around true + # use python truthiness + until: "response.json.acknowledged == true" + retries: 5 + delay: 30 + +# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html +## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version. +## +## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow. +## +## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns). 
+ +- name: Check cluster health + block: + - name: Wait for cluster health to return to green + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" + method: GET + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response + until: "response.json.status == 'green'" + retries: 50 + delay: 30 + + # Timed out while waiting for green cluster + # Check if we can continue with a yellow cluster + rescue: + - name: "Rescue: Check if cluster health is yellow" + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" + method: GET + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response + failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" + + - name: "Rescue: Wait before verifying status" + ansible.builtin.pause: + seconds: 10 + + - name: "Rescue: Verify we can safely continue with yellow cluster" + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" + method: GET + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + register: response + failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" From 275dea3e5827e2a495c1be9bc0e732b28c2de8ae Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Mon, 17 Mar 2025 15:42:41 +0100 Subject: [PATCH 08/10] Add option to stop ML jobs and set cluster allocation mode. 
Update docs --- docs/role-elasticsearch.md | 6 ++++++ roles/elasticsearch/defaults/main.yml | 4 ++++ .../tasks/elasticsearch-rolling-start.yml | 11 +++++++++++ .../tasks/elasticsearch-rolling-stop.yml | 15 +++++++++++++-- .../tasks/elasticsearch-rolling-upgrade.yml | 1 - 5 files changed, 34 insertions(+), 3 deletions(-) diff --git a/docs/role-elasticsearch.md b/docs/role-elasticsearch.md index 693dac09..00661a48 100644 --- a/docs/role-elasticsearch.md +++ b/docs/role-elasticsearch.md @@ -9,6 +9,9 @@ If you use the role to set up security you, can use its CA to create certificate Please note that setting `elasticsearch_bootstrap_pw` as variable will only take effect when initialising Elasticsearch. Changes after starting elasticsearch for the first time will not change the bootstrap password for the instance and will lead to breaking tests. +The role can perform a rolling upgrade of an Elasticsearch cluster. Nodes will be upgraded in inventory order. If you are using data tiers, your Ansible inventory should be sorted according to data tiers. Set `any_errors_fatal: true` in your playbook to abort if any errors occur during the upgrade. + + Role Variables -------------- @@ -54,6 +57,9 @@ This variable activates a workaround to start on systems that have certain harde * *elasticsearch_seed_hosts*: Set elasticsearch seed hosts * *elasticsearch_security_enrollment*: Controls enrollment (of nodes and Kibana) to a local node that’s been autoconfigured for security. +*elasticsearch_upgrade_routing_mode*: Set `cluster.routing.allocation.enable` during rolling upgrade. (default: `none`) +*elasticsearch_upgrade_stop_ml*: Stop the tasks associated with active machine learning jobs and datafeeds during rolling upgrade. (default: `false`) + The following variable was only integrated to speed up upgrades of non-production clusters. 
Use with caution and at your own risk: * *elasticsearch_unsafe_upgrade_restart*: This will still perform rolling upgrades, but will first update the package and then restart the service. In contrast the default behaviour is to stop the service, do the upgrade and then start again. (default: `false`) diff --git a/roles/elasticsearch/defaults/main.yml b/roles/elasticsearch/defaults/main.yml index 29aaa0c6..2303c4aa 100644 --- a/roles/elasticsearch/defaults/main.yml +++ b/roles/elasticsearch/defaults/main.yml @@ -37,6 +37,10 @@ elasticsearch_cert_expiration_buffer: 30 elasticsearch_cert_will_expire_soon: false elasticsearch_ssl_verification_mode: full +# Upgrade options +elasticsearch_upgrade_routing_mode: "none" +elasticsearch_upgrade_stop_ml: false + # use this only for non-prod environments and at your own risk! elasticsearch_unsafe_upgrade_restart: false diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml index 627df8da..bad16c1a 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml @@ -55,3 +55,14 @@ # Don't continue the play unless cluster health is OK - name: Cluster health check ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml + +- name: Restart ML jobs + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_ml/set_upgrade_mode?enabled=false" + method: POST + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + failed_when: false + when: + - elasticsearch_upgrade_stop_ml | bool diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml index f98d0524..3bf3719d 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml @@ 
-21,12 +21,23 @@ ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml -# Disabling shard allocation right after enabling it seems redundant. Please see above for details. +- name: Stop ML jobs while upgrading + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_ml/set_upgrade_mode?enabled=true" + method: POST + user: elastic + password: "{{ elasticstack_password.stdout }}" + validate_certs: no + failed_when: false + run_once: true + when: + - elasticsearch_upgrade_stop_ml | bool + - name: Disable shard allocation for the cluster ansible.builtin.uri: url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings" method: PUT - body: '{ "persistent": { "cluster.routing.allocation.enable": "none" }}' + body: "{ \"persistent\": { \"cluster.routing.allocation.enable\": \"{{ elasticsearch_upgrade_routing_mode }}\" } }" body_format: json user: elastic password: "{{ elasticstack_password.stdout }}" diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml index ab319c07..d8d90e9e 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml @@ -64,7 +64,6 @@ notify: - Restart Elasticsearch - - name: Be careful about upgrade when Elasticsearch is running when: - elasticsearch_running.status.ActiveState == "active" From 65953ae753dfe70e624808c38bcf103fc4f4ae2d Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Tue, 18 Mar 2025 08:40:56 +0100 Subject: [PATCH 09/10] Formatting --- docs/role-elasticsearch.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/role-elasticsearch.md b/docs/role-elasticsearch.md index 00661a48..d5f5fe89 100644 --- a/docs/role-elasticsearch.md +++ b/docs/role-elasticsearch.md @@ -57,8 +57,8 @@ This 
variable activates a workaround to start on systems that have certain harde * *elasticsearch_seed_hosts*: Set elasticsearch seed hosts * *elasticsearch_security_enrollment*: Controls enrollment (of nodes and Kibana) to a local node that’s been autoconfigured for security. -*elasticsearch_upgrade_routing_mode*: Set `cluster.routing.allocation.enable` during rolling upgrade. (default: `none`) -*elasticsearch_upgrade_stop_ml*: Stop the tasks associated with active machine learning jobs and datafeeds during rolling upgrade. (default: `false`) +* *elasticsearch_upgrade_routing_mode*: Set `cluster.routing.allocation.enable` during rolling upgrade. (default: `none`) +* *elasticsearch_upgrade_stop_ml*: Stop the tasks associated with active machine learning jobs and datafeeds during rolling upgrade. (default: `false`) The following variable was only integrated to speed up upgrades of non-production clusters. Use with caution and at your own risk: * *elasticsearch_unsafe_upgrade_restart*: This will still perform rolling upgrades, but will first update the package and then restart the service. In contrast the default behaviour is to stop the service, do the upgrade and then start again. (default: `false`) From d7d6c7977a71a0902b992dca39246852c8c3f0d3 Mon Sep 17 00:00:00 2001 From: Ivar Eriksen Date: Wed, 19 Mar 2025 15:12:21 +0100 Subject: [PATCH 10/10] Revert "Avoid risky pipes. Increase retries to handle clusters with single frozen or cold datanode" This reverts commit d83d38f60647c2e0e9488cb32eae4a89ef520aa1. 
--- .../tasks/elasticsearch-rolling-start.yml | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml index bad16c1a..283cc882 100644 --- a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml +++ b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml @@ -36,21 +36,23 @@ port: "{{ elasticstack_elasticsearch_http_port }}" delay: 30 -- name: Confirm the node joins the cluster - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name&format=json" - method: GET - user: elastic - password: "{{ elasticstack_password.stdout }}" - validate_certs: no - register: response - until: node_found | bool +- name: Confirm the node joins the cluster # noqa: risky-shell-pipe + ansible.builtin.shell: > + if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi; + curl + -k + -u elastic:{{ elasticstack_password.stdout }} + -s + -m 2 + '{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name' + | grep + -E + '^{{ elasticsearch_nodename }}$' + register: result + until: result.rc == 0 retries: 200 delay: 3 changed_when: false - vars: - node_found: "{{ response.json | json_query(node_query) | length > 0 }}" - node_query: "[?name=='{{ elasticsearch_nodename }}']" # Don't continue the play unless cluster health is OK - name: Cluster health check