apiVersion: v1
data:
  detect_gcomm_and_start.sh: |
    #!/bin/bash
    # Wait for the galera operator to publish this pod's gcomm URI, then
    # exec mysqld with that cluster address (bootstrap or join).
    set -eu
    source /var/lib/operator-scripts/mysql_root_auth.sh
    URI_FILE=/var/lib/mysql/gcomm_uri
    # remove any stale socket/URI left over from a previous container run
    rm -f /var/lib/mysql/mysql.sock
    rm -f $URI_FILE
    echo "Waiting for gcomm URI to be configured for this POD"
    while [ ! -f $URI_FILE ]; do
        sleep 2
    done
    set -x
    URI=$(cat $URI_FILE)
    if [ "$URI" = "gcomm://" ]; then
        echo "this POD will now bootstrap a new galera cluster"
        # mark the local state as bootstrappable, otherwise mysqld refuses
        # to start a new cluster from pre-existing datafiles
        if [ -f /var/lib/mysql/grastate.dat ]; then
            sed -i -e 's/^\(safe_to_bootstrap\):.*/\1: 1/' /var/lib/mysql/grastate.dat
        fi
    else
        echo "this POD will now join cluster $URI"
    fi
    # consume the URI file so the next restart waits for a fresh one
    rm -f $URI_FILE
    exec /usr/libexec/mysqld --wsrep-cluster-address="$URI"
  detect_last_commit.sh: |
    #!/bin/bash
    # Print (as JSON, via the EXIT trap) the last known galera write-set
    # uuid/seqno of this node, so the operator can pick a bootstrap node.
    set -eu
    source /var/lib/operator-scripts/mysql_root_auth.sh
    # Adapted from clusterlab's galera resource agent
    recover_args="--datadir=/var/lib/mysql \
                  --user=mysql \
                  --skip-networking \
                  --wsrep-cluster-address=gcomm://localhost"
    recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p'
    recovered_position_uuid_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position\:\ \(.*\)\:.*$/\1/p'
    recovered_position_seqno_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
    grastate_file=/var/lib/mysql/grastate.dat
    gvwstate_file=/var/lib/mysql/gvwstate.dat
    uuid=""
    seqno=""
    safe_to_bootstrap=0
    no_grastate=0
    # Emit whatever state was collected as a one-line JSON object.
    # Runs on EXIT so a partial result is still reported on early failure.
    function json_summary {
        declare -a out
        if [ -n "$uuid" ]; then out+=( "\"uuid\":\"$uuid\"" ); fi
        if [ -n "$seqno" ]; then out+=( "\"seqno\":\"$seqno\"" ); fi
        if [ $safe_to_bootstrap -ne 0 ]; then out+=( '"safe_to_bootstrap":true' ); fi
        if [ $no_grastate -ne 0 ]; then out+=( '"no_grastate":true' ); fi
        IFS=, ; echo "{${out[*]}}"
    }
    trap json_summary EXIT
    # codership/galera#354
    # Some ungraceful shutdowns can leave an empty gvwstate.dat on
    # disk. This will prevent galera to join the cluster if it is
    # configured to attempt PC recovery. Removing that file makes the
    # node fall back to the normal, unoptimized joining process.
    if [ -f $gvwstate_file ] && \
       [ ! -s $gvwstate_file ]; then
        echo "empty $gvwstate_file detected, removing it to prevent PC recovery failure at next restart" >&2
        rm -f $gvwstate_file
    fi
    # Attempt to retrieve the seqno information and safe_to_bootstrap hint
    # from the saved state file on disk
    if [ -f $grastate_file ]; then
        uuid="$(cat $grastate_file | sed -n 's/^uuid.\s*\(.*\)\s*$/\1/p')"
        seqno="$(cat $grastate_file | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
        safe_to_bootstrap="$(cat $grastate_file | sed -n 's/^safe_to_bootstrap.\s*\(.*\)\s*$/\1/p')"
        # the all-zero uuid means the state file carries no usable identity
        if [ -z "$uuid" ] || \
           [ "$uuid" = "00000000-0000-0000-0000-000000000000" ]; then
            safe_to_bootstrap=0
        fi
        if [ "$safe_to_bootstrap" = "1" ]; then
            if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
                safe_to_bootstrap=0
            fi
        fi
    fi
    # If the seqno could not be retrieved, inspect the mysql database
    if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
        tmp=$(mktemp)
        chown mysql:mysql $tmp
        # if we pass here because grastate.dat doesn't exist, report it
        if [ ! -f /var/lib/mysql/grastate.dat ]; then
            no_grastate=1
        fi
        mysqld_safe --wsrep-recover $recover_args --log-error=$tmp >/dev/null
        seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
        uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
        if [ -z "$seqno" ]; then
            # Galera uses InnoDB's 2pc transactions internally. If
            # server was stopped in the middle of a replication, the
            # recovery may find a "prepared" XA transaction in the
            # redo log, and mysql won't recover automatically
            recovery_file="$(cat $tmp | sed -n $recovery_file_regex)"
            if [ -e $recovery_file ]; then
                # NOTE(review): under `set -eu` a non-matching grep in this
                # pipeline would abort the script before the $? check below;
                # kept as-is to preserve the original control flow — confirm
                # against the canonical upstream script.
                cat $recovery_file | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null
                if [ $? -eq 0 ]; then
                    # we can only rollback the transaction, but that's OK
                    # since the DB will get resynchronized anyway
                    echo "local node was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover" >&2
                    mysqld_safe --wsrep-recover $recover_args \
                        --tc-heuristic-recover=rollback --log-error=$tmp >/dev/null 2>&1
                    seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
                    uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
                    if [ ! -z "$seqno" ]; then
                        echo "State recovered. force SST at next restart for full resynchronization" >&2
                        rm -f /var/lib/mysql/grastate.dat
                        # try not to bootstrap from this node if possible
                        no_grastate=1
                    fi
                fi
            fi
        fi
        rm -f $tmp
    fi
    if [ -z "$seqno" ]; then
        echo "Unable to detect last known write sequence number" >&2
        exit 1
    fi
    # json data is printed on exit
  mysql_bootstrap.sh: |
    #!/bin/bash
    # Init-container entry point: make sure the datadir root password matches
    # the cluster secret, then render mariadb config files from templates.
    set +eux
    # prepare space for mysql_root_auth.sh to place pw cache file
    sudo mkdir -p /var/local/my.cnf
    sudo chown mysql:mysql /var/local/my.cnf
    # set up $DB_ROOT_PASSWORD.
    # disable my.cnf caching in mysql_root_auth.sh, so that we definitely
    # use the root password defined in the cluster. this should create
    # a new file in /var/local/my.cnf/
    MYSQL_ROOT_AUTH_BYPASS_CHECKS=true source /var/lib/operator-scripts/mysql_root_auth.sh
    function kolla_update_db_root_pw {
        # update the root password given a set of mariadb datafiles
        # ported from kolla_extend_start with major changes
        #
        # because galera controller generates a new root password if one was
        # not sent via pre-existing secret, the root pw has to be updated if
        # existing datafiles are present, as they will still store the previous
        # root pw which we by definition don't know what it is
        #
        # to achieve this, we have to run our own mysqld.
        #
        # before we do any of that, check if the socket file exists and we can log
        # in with the current password. this happens if we are running in a debug
        # container for example; sock file is there, works, lets us log in to
        # the mysqld running on the primary container. do nothing in that case
        #
        # TOPIC: Why does /var/lib/mysql/mysql.sock allow the debug container to
        # communicate with the primary container?
        # Having two containers communicate over a sockfile only works when both
        # containers are on the same host. "oc debug" will run from the same
        # host because:
        #
        # 1. https://bugzilla.redhat.com/show_bug.cgi?id=1806662 - "oc debug of pod
        #    with host mounts should target the same node"
        # 2. ReadWriteOnce volumes enforce single-node mounting, and we mount with
        #    RWO
        #    (https://cookbook.openshift.org/storing-data-in-persistent-volumes/how-can-i-scale-applications-using-rwo-persistent-volumes.html)
        #
        # So it is functionally impossible for a container from another host to
        # mount this volume at the same time, we can be assured we're on the same
        # host in oc debug if we see these files.
        #
        if [[ -S /var/lib/mysql/mysql.sock ]]; then
            echo -e "Socket file /var/lib/mysql/mysql.sock exists, testing current root password"
            if timeout 5 mysql -u root -p"${DB_ROOT_PASSWORD}" -e "SELECT 1;" &>/dev/null; then
                echo -e "Successfully logged in via existing /var/lib/mysql/mysql.sock file with current root password, not attempting to change password"
                # hilarity avoided. trying to run another mysqld when another container
                # is running one against the same /var/lib/mysql fails to start, claiming
                # exclusive lock issues. these exclusive lock issues can only be
                # detected by reading the logfile of the failed mysqld process
                # (flock, fcntl, etc. cannot detect it). this is a slow process
                # that's best avoided if not needed anyway.
                return
            else
                echo -e "Could not log in with current password, proceeding with password reset"
            fi
        fi
        # OK there's no sockfile or we couldn't log in. let's do this
        # first, use log/pidfile on a non-mounted volume, so that we can see what
        # happens local to this container (which might be a debug container)
        # without leaking details from other processes that may share /var/lib/mysql
        CHANGE_PW_PIDFILE=/var/tmp/updatepw.pid
        CHANGE_PW_LOGFILE=/var/tmp/updatepw.log
        echo -e "Running with --skip-grant-tables to reset root password"
        rm -fv ${CHANGE_PW_PIDFILE} ${CHANGE_PW_LOGFILE}
        mysqld_safe --skip-grant-tables --wsrep-on=OFF --log-error=${CHANGE_PW_LOGFILE} --pid-file=${CHANGE_PW_PIDFILE} &
        # Wait for the mariadb server to be "Ready" before running root update commands
        # Querying the cluster status has to be executed after the existence of mysql.sock and mariadb.pid.
        ORIG_TIMEOUT=${DB_MAX_TIMEOUT:-60}
        TIMEOUT=${ORIG_TIMEOUT}
        while [[ ! -S /var/lib/mysql/mysql.sock ]] || \
              [[ ! -f "${CHANGE_PW_PIDFILE}" ]]; do
            if [[ ${TIMEOUT} -gt 0 ]]; then
                let TIMEOUT-=1
                sleep 1
            else
                echo -e "Surpassed timeout of ${ORIG_TIMEOUT} without seeing a pidfile"
                echo -e "Dump of ${CHANGE_PW_LOGFILE}"
                cat ${CHANGE_PW_LOGFILE}
                exit 1
            fi
        done
        echo -e "Refreshing root passwords"
        # NOTE(review): the source dump is garbled/elided from here to the end
        # of this function: the SQL that refreshes the root account, the
        # shutdown of the temporary mysqld and the function's closing brace
        # were lost. The statements below are a minimal, plausible
        # reconstruction — TODO: restore the exact statements from the
        # canonical upstream mysql_bootstrap.sh.
        mysql -u root <<EOSQL
    FLUSH PRIVILEGES;
    SET PASSWORD FOR 'root'@'localhost' = PASSWORD('${DB_ROOT_PASSWORD}');
    EOSQL
        mysqladmin -u root -p"${DB_ROOT_PASSWORD}" shutdown
    }
    # NOTE(review): the if/else that decides between reusing existing
    # datafiles (password refresh) and bootstrapping a new database was also
    # elided in the source dump; reconstructed below — verify against upstream.
    if [ -e /var/lib/mysql/mysql ]; then
        echo -e "Database already exists, refreshing root password"
        kolla_update_db_root_pw
    else
        echo -e "Bootstrapping new database"
        # minimal config for the initial, non-clustered bootstrap mysqld
        cat <<EOF >/var/lib/config-data/generated/galera.cnf
    [client]
    !includedir /var/local/my.cnf/
    [mysqld]
    bind_address=localhost
    wsrep_provider=none
    EOF
        sudo -E kolla_set_configs
        kolla_extend_start
    fi
    # Generate the mariadb configs from the templates, these will get
    # copied by `kolla_start` when the pod's main container will start
    if [ "$(sysctl -n crypto.fips_enabled)" == "1" ]; then
        echo FIPS enabled
        SSL_CIPHER='ECDHE-RSA-AES256-GCM-SHA384'
    else
        SSL_CIPHER='AES128-SHA256'
    fi
    PODNAME=$(hostname -f | cut -d. -f1,2)
    PODIPV4=$(grep "${PODNAME}" /etc/hosts | grep -v ':' | cut -d$'\t' -f1)
    PODIPV6=$(grep "${PODNAME}" /etc/hosts | grep ':' | cut -d$'\t' -f1)
    cd /var/lib/config-data/default
    for cfg in *.cnf.in; do
        if [ -s "${cfg}" ]; then
            # prefer IPv6 listen address when the pod has one
            if [[ "" = "${PODIPV6}" ]]; then
                PODIP="${PODIPV4}"
                IPSTACK="IPV4"
            else
                PODIP="[::]"
                IPSTACK="IPV6"
            fi
            echo "Generating config file from template ${cfg}, will use ${IPSTACK} listen address of ${PODIP}"
            sed -e "s/{ PODNAME }/${PODNAME}/" -e "s/{ PODIP }/${PODIP}/" -e "s/{ SSL_CIPHER }/${SSL_CIPHER}/" "/var/lib/config-data/default/${cfg}" > "/var/lib/config-data/generated/${cfg%.in}"
        fi
    done
  mysql_probe.sh: |
    #!/bin/bash
    # k8s startup/readiness/liveness probe for a galera node.
    # Usage: mysql_probe.sh startup <timeout-secs> | readiness | liveness
    set -u
    source /var/lib/operator-scripts/mysql_root_auth.sh
    PROBE_USER=root
    MYSQL_SOCKET=/var/lib/mysql/mysql.sock
    SST_IN_PROGRESS=/var/lib/mysql/sst_in_progress
    CHECK_RETRY=10
    CHECK_WAIT=0.5
    STARTUP_WAIT=2
    LAST_STATE=""
    function log_state {
        local state="$1"
        # do not duplicate error logs in the probe, to minimize the
        # output in k8s events in case the probe fails
        if [ "${LAST_STATE}" != "${state}" ]; then
            LAST_STATE="${state}"
        fi
    }
    function log_last_state {
        if [ -n "${LAST_STATE}" ]; then
            echo "${LAST_STATE}"
        fi
    }
    trap log_last_state EXIT
    # Print the value of a single mysql status variable, retrying while
    # mysql is not yet reachable; last mysql error goes to stderr on failure.
    function get_mysql_status {
        local status=$1
        local i
        local out
        for i in $(seq $CHECK_RETRY); do
            out=$(mysql -u${PROBE_USER} -sNEe "show status like '${status}';" 2>&1)
            if [ $? -eq 0 ]; then
                echo "${out}" | tail -1
                return 0
            else
                sleep ${CHECK_WAIT}
            fi
        done
        # if we pass here, log the last error from mysql
        echo "${out}" >&2
        return 1
    }
    # Return 0 iff status variable $1 currently equals $2.
    function check_mysql_status {
        local status=$1
        local expect=$2
        local val
        local rc
        val=$(get_mysql_status "${status}")
        test "${val}" = "${expect}"
        rc=$?
        if [ $rc -ne 0 ]; then
            log_state "${status} (${val}) differs from ${expect}"
        fi
        return $rc
    }
    function check_sst_in_progress {
        local i
        # retry to give some time to mysql to set up the SST
        for i in $(seq $CHECK_RETRY); do
            if [ -e ${MYSQL_SOCKET} ]; then
                return 1
            elif [ -e ${SST_IN_PROGRESS} ]; then
                return 0
            else
                sleep ${CHECK_WAIT}
            fi
        done
        return 1
    }
    function check_mysql_ready {
        local i
        # retry to give some time to mysql to create its socket
        for i in $(seq $CHECK_RETRY); do
            # FIX(review): original redirected to relative path "dev/null"
            if [ -e ${MYSQL_SOCKET} ] && mysqladmin -s -u${PROBE_USER} ping >/dev/null; then
                return 0
            else
                sleep ${CHECK_WAIT}
            fi
        done
        return 1
    }
    # Monitor the startup sequence until the galera node is connected
    # to a primary component and synced
    # NOTE: as of mariadb 10.5, if mysql connects to a non-primary
    # partition, it never creates any socket and gets stuck indefinitely.
    # In that case, in order to not wait until the startup times out
    # (very long), we error out of the probe so that the pod can restart
    # and mysql reconnect to a primary partition if possible.
    function check_mysql_startup {
        # mysql initialization sequence:
        # . mysql connects to a remote galera node over port 4567
        # . mysql optionally runs a SST (port 4444), SST marker created on disk
        # . only at this point, InnoDB is initialized, mysql pidfile and
        #   mysql socket are created on disk
        if pgrep -f detect_gcomm_and_start.sh >/dev/null ; then
            log_state "waiting for gcomm URI"
            return 1
        fi
        # pidfile is not written on disk until mysql is ready,
        # so look for the mysqld process instead
        if ! pgrep -f /usr/libexec/mysqld >/dev/null ; then
            log_state "waiting for mysql to start"
            return 1
        fi
        # a bootstrap node must be reachable from the CLI to finish startup
        if pgrep -f -- '--wsrep-cluster-address=gcomm://(\W|$)' >/dev/null; then
            check_mysql_ready
            return $?
        # a joiner node must have an established socket connection before testing further
        elif pgrep -f -- '--wsrep-cluster-address=gcomm://\w' >/dev/null; then
            local connections
            connections=$(ss -tnH state established src :4567 or dst :4567 | wc -l)
            # FIX(review): original tested "-ge 0", which is always true for
            # wc output, making this wait-state unreachable
            if ! test "${connections}" -ge 1; then
                log_state "waiting for mysql to join a galera cluster"
                return 1
            fi
        else
            log_state "could not determine galera startup mode"
            exit 1
        fi
        # a joiner node requires additional startup checks
        if [ -e /var/lib/mysql/mysql.sock ]; then
            # good case, mysql is ready to be probed from the CLI
            # check WSREP status like the regular liveness probe
            local status
            local comment
            status=$(get_mysql_status wsrep_cluster_status)
            comment=$(get_mysql_status wsrep_local_state_comment)
            if [ "${status}" = "Primary" -a "${comment}" = "Synced" ]; then
                return 0
            elif [ "${status}" = "Primary" ]; then
                log_state "waiting to be synced with the cluster"
                return 1
            # FIX(review): original had no space before the closing "]",
            # which is a test(1) syntax error
            elif [ "${status}" = "Non-primary" -a "${comment}" = "Synced" ]; then
                log_state "mysql is connected to a non-primary partition, server stopped"
                exit 1
            else
                log_state "waiting for connection to a primary partition"
                return 1
            fi
        else
            # if there is no socket, mysql may be running an SST...
            if check_sst_in_progress; then
                log_state "waiting for SST to finish"
                return 1
            fi
            # ... if no SST was detected, it may have finished before
            # we probed it. Check a last time whether we can connect to mysql
            if check_mysql_ready; then
                return 0
            fi
            # At this stage, mysql is either trying to connect to a boostrap node
            # that resolved to an old pod IP, or it is is connected to a
            # non-primary partition. Either way, this is not recoverable, so
            # make the probe fail and let k8s kill the mysql server.
            log_state "could not find a primary partition to connect to"
            exit 1
        fi
        return 1
    }
    # startup probe loops until the node started or joined a galera cluster
    # readiness and liveness probes are run by k8s only after start probe succeeded
    case "$1" in
        startup)
            # FIX(review): with `set -u` a bare "$2" aborts with an unbound
            # variable error before the intended message could be printed
            if [ -z "${2:-}" ]; then
                echo "startup timeout option missing"
                exit 1
            fi
            TIME_TIMEOUT=$2
            # Run the entire check in a single startup probe to avoid spurious
            # "Unhealthy" k8s events to be logged. The probe stops in error
            # if the startup timeout is reached
            rc=1
            while [ $rc -ne 0 ]; do
                if check_mysql_startup; then
                    exit 0
                else
                    sleep ${STARTUP_WAIT};
                    [ $SECONDS -ge $TIME_TIMEOUT ] && exit 1
                fi
            done
            exit $rc
            ;;
        readiness)
            # If the node is e.g. a donor, it cannot serve traffic
            check_mysql_status wsrep_local_state_comment Synced
            ;;
        liveness)
            # If the node is not in the primary partition, the failed liveness probe
            # will make k8s restart this pod
            check_mysql_status wsrep_cluster_status Primary
            ;;
        *)
            echo "Invalid probe option '$1'"
            exit 1;;
    esac
  mysql_root_auth.sh: |
    #!/bin/bash
    # Sourced by the other scripts: resolve the current root password from
    # the pw cache file, the Galera CR status secret, or the account-spec
    # fallback secret, and export MYSQL_PWD / DB_ROOT_PASSWORD.
    set +eu
    POD_NAME=$(hostname)
    # API server config
    APISERVER=https://kubernetes.default.svc
    SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
    NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
    TOKEN=$(cat ${SERVICEACCOUNT}/token)
    CACERT=${SERVICEACCOUNT}/ca.crt
    K8S_API="api/v1"
    MARIADB_API="apis/mariadb.openstack.org/v1beta1"
    # default empty value when the env doesn't specify it
    : ${GALERA_INSTANCE=openstack-cell1}
    PW_CACHE_FILE="/var/local/my.cnf/mysql_pw_cache.cnf"
    MYSQL_SOCKET=/var/lib/mysql/mysql.sock
    CREDENTIALS_CHECK_TIMEOUT=4
    # Set up connection parameters based on whether we're connecting remotely or locally
    # default empty value when the env doesn't specify it
    : ${MYSQL_CONN_PARAMS=}
    if [ -n "${MYSQL_REMOTE_HOST}" ]; then
        MYSQL_CONN_PARAMS="${MYSQL_CONN_PARAMS} -h ${MYSQL_REMOTE_HOST} -P 3306"
        USE_SOCKET=false
    else
        USE_SOCKET=true
    fi
    # Check if we have cached credentials
    if [ "${MYSQL_ROOT_AUTH_BYPASS_CHECKS}" != "true" ] && [ -f "${PW_CACHE_FILE}" ]; then
        # Read the password from .my.cnf
        # NOTE(review): this command was redacted in the source dump;
        # reconstructed — verify against the canonical upstream script.
        PASSWORD=$(grep '^password=' "${PW_CACHE_FILE}" | cut -d= -f2-)
        # Validate credentials if MySQL is accessible
        if [ -n "${PASSWORD}" ]; then
            # For local connections, check if socket exists; for remote, always try
            SHOULD_VALIDATE=false
            if [ "${USE_SOCKET}" = "true" ]; then
                if [ -S "${MYSQL_SOCKET}" ]; then
                    SHOULD_VALIDATE=true
                fi
            else
                # Remote connection - always validate
                SHOULD_VALIDATE=true
            fi
            credentials_check=1
            if [ "${SHOULD_VALIDATE}" = "true" ]; then
                timeout ${CREDENTIALS_CHECK_TIMEOUT} mysql ${MYSQL_CONN_PARAMS} -uroot -p"${PASSWORD}" -e "SELECT 1;" >/dev/null 2>&1
                credentials_check=$?
            fi
            if [ "${SHOULD_VALIDATE}" = "true" ] && [ $credentials_check -eq 124 ]; then
                # MySQL validation timed out, assume cache is valid and will be validated on next probe
                export MYSQL_PWD="${PASSWORD}"
                export DB_ROOT_PASSWORD="${PASSWORD}"
                return 0
            elif [ "${SHOULD_VALIDATE}" = "true" ] && [ $credentials_check -eq 0 ]; then
                # Credentials are still valid, use cached values
                export MYSQL_PWD="${PASSWORD}"
                export DB_ROOT_PASSWORD="${PASSWORD}"
                return 0
            elif [ "${USE_SOCKET}" = "true" ] && [ ! -S "${MYSQL_SOCKET}" ]; then
                # MySQL not running locally, assume cache is valid and will be validated on next probe
                export MYSQL_PWD="${PASSWORD}"
                export DB_ROOT_PASSWORD="${PASSWORD}"
                return 0
            fi
        fi
        # If we get here, credentials are invalid, fall through to refresh
    fi
    # Get the Galera CR
    GALERA_CR=$(curl -s \
        --cacert ${CACERT} \
        --header "Content-Type:application/json" \
        --header "Authorization: Bearer ${TOKEN}" \
        "${APISERVER}/${MARIADB_API}/namespaces/${NAMESPACE}/galeras/${GALERA_INSTANCE}")
    # note jq is not installed in the galera image, macgyvering w/ python instead
    SECRET_NAME=$(echo "${GALERA_CR}" | python3 -c "import json, sys; print(json.load(sys.stdin)['status']['rootDatabaseSecret'])")
    # get password from secret
    PASSWORD=$(curl -s \
        --cacert ${CACERT} \
        --header "Content-Type:application/json" \
        --header "Authorization: Bearer ${TOKEN}" \
        "${APISERVER}/${K8S_API}/namespaces/${NAMESPACE}/secrets/${SECRET_NAME}" \
        | python3 -c "import json, sys; print(json.load(sys.stdin)['data']['DatabasePassword'])" \
        | base64 -d)
    # Special step for the unlikely case that root PW is being changed but the
    # account.sh script failed to complete. Test this password (which came from
    # galera->Status->rootDatabaseSecret) and if not working, see if there is a
    # different (newer) password in root galera->Spec->rootDatabaseAccount->Secret,
    # and try that. This suits the case where a new password was placed in
    # galera->Spec->rootDatabaseAccount->Secret, account.sh ran to update the root
    # password, but failed to complete, even though the actual password got
    # updated. account.sh will run again on a new pod but the password that's in
    # galera->Status->rootDatabaseSecret is no longer valid, and would prevent
    # account.sh from proceeding a second time. Try the "pending" password just to
    # get through, so that account.sh can succeed and
    # galera->Status->rootDatabaseSecret can then be updated.
    PASSWORD_VALID=true
    # test password with mysql command if socket exists, or we are remote
    if [ "${MYSQL_ROOT_AUTH_BYPASS_CHECKS}" != "true" ] && { [ "${USE_SOCKET}" = "false" ] || [ -S "${MYSQL_SOCKET}" ]; }; then
        if ! mysql ${MYSQL_CONN_PARAMS} -uroot -p"${PASSWORD}" -e "SELECT 1;" >/dev/null 2>&1; then
            echo "WARNING: primary password retrieved from cluster failed authentication; will try fallback password" >&2
            PASSWORD_VALID=false
        fi
    fi
    # if password failed, look for alternate password from the mariadbdatabaseaccount
    # spec directly. assume we are in root pw flight
    if [ "${PASSWORD_VALID}" = "false" ]; then
        MARIADB_ACCOUNT=$(echo "${GALERA_CR}" | python3 -c "import json, sys; print(json.load(sys.stdin)['spec']['rootDatabaseAccount'] or '${GALERA_INSTANCE}-mariadb-root')")
        MARIADB_ACCOUNT_CR=$(curl -s \
            --cacert ${CACERT} \
            --header "Content-Type:application/json" \
            --header "Authorization: Bearer ${TOKEN}" \
            "${APISERVER}/${MARIADB_API}/namespaces/${NAMESPACE}/mariadbaccounts/${MARIADB_ACCOUNT}")
        # look in spec.secret
        FALLBACK_SECRET_NAME=$(echo "${MARIADB_ACCOUNT_CR}" | python3 -c "import json, sys; print(json.load(sys.stdin)['spec']['secret'])")
        # Get the new password from the fallback secret
        PASSWORD=$(curl -s \
            --cacert ${CACERT} \
            --header "Content-Type:application/json" \
            --header "Authorization: Bearer ${TOKEN}" \
            "${APISERVER}/${K8S_API}/namespaces/${NAMESPACE}/secrets/${FALLBACK_SECRET_NAME}" \
            | python3 -c "import json, sys; print(json.load(sys.stdin)['data']['DatabasePassword'])" \
            | base64 -d)
        # test again; warn if it doesn't work, however write to my.cnf in any
        # case to allow the calling script to continue
        if ! mysql ${MYSQL_CONN_PARAMS} -uroot -p"${PASSWORD}" -e "SELECT 1;" >/dev/null 2>&1; then
            echo "WARNING: Both primary and fallback passwords failed authentication, will maintain fallback password" >&2
        fi
    fi
    MYSQL_PWD="${PASSWORD}"
    DB_ROOT_PASSWORD="${PASSWORD}"
    # Cache credentials to $PW_CACHE_FILE.
    # Create the directory if it doesn't exist
    PW_CACHE_DIR=$(dirname "${PW_CACHE_FILE}")
    if [ ! -d "${PW_CACHE_DIR}" ]; then
        if ! mkdir -p "${PW_CACHE_DIR}" 2>/dev/null; then
            echo "WARNING: Failed to create directory ${PW_CACHE_DIR} due to permissions; will try again later" >&2
        fi
    fi
    # NOTE(review): the redirections and the password line of this heredoc
    # were redacted/garbled in the source dump; reconstructed — verify
    # against the canonical upstream script.
    if ! cat > "${PW_CACHE_FILE}" 2>/dev/null <<EOF
    [client]
    user=root
    password=${PASSWORD}
    EOF
    then
        # we are called for the first time from detect_gcomm_and_start.sh which is
        # called **before** kolla can set directory permissions; so when writing
        # the file, proceed even if we can't write the file yet
        echo "WARNING: Failed to write to ${PW_CACHE_FILE} due to permissions; will try again later" >&2
    fi
    # Set restrictive permissions on .my.cnf (only if file was successfully written)
    if [ -f "${PW_CACHE_FILE}" ]; then
        if ! chmod 600 "${PW_CACHE_FILE}" 2>/dev/null; then
            echo "WARNING: Failed to set permissions on ${PW_CACHE_FILE}; will try again later" >&2
        fi
    fi
    export MYSQL_PWD
    export DB_ROOT_PASSWORD
  mysql_shutdown.sh: |
    #!/bin/bash
    # Pre-stop hook: on resource deletion, wait for higher-ordinal pods to
    # stop first, then kill client connections before k8s SIGTERMs mysqld.
    source /var/lib/operator-scripts/mysql_root_auth.sh
    # NOTE(dciabrin) we might use downward API to populate those in the future
    PODNAME=$HOSTNAME
    SERVICE=${PODNAME/-galera-[0-9]*/}
    MYSQL_SOCKET=/var/lib/mysql/mysql.sock
    # API server config
    APISERVER=https://kubernetes.default.svc
    SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
    NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
    TOKEN=$(cat ${SERVICEACCOUNT}/token)
    CACERT=${SERVICEACCOUNT}/ca.crt
    function log() {
        echo "$(date +%F_%H_%M_%S) `basename $0` $*"
    }
    # Log in mariadb's log file if configured, so the output of this script
    # is captured when logToDisk is enabled in the galera CR
    LOGFILE=$(my_print_defaults mysqld | grep log-error | cut -d= -f2)
    if [ -f "$LOGFILE" ]; then
        exec &> >(cat >> "$LOGFILE") 2>&1
    else
        exec &> >(cat >> /proc/1/fd/1) 2>&1
    fi
    # if the mysql socket is not available, mysql is either not started or
    # not reachable, orchestration stops here.
    if [ ! -e $MYSQL_SOCKET ]; then
        exit 0
    fi
    # On update, k8s performs a rolling restart, but on resource deletion,
    # all pods are deleted concurrently due to the fact that we require
    # PodManagementPolicy: appsv1.ParallelPodManagement for bootstrapping
    # the cluster. So try to stop the nodes sequentially so that
    # the last galera node stopped can set a "safe_to_bootstrap" flag.
    # NOTE(review): the namespace is hard-coded to "openstack" here instead
    # of using ${NAMESPACE} — confirm whether this is intentional.
    if curl -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" -X GET ${APISERVER}/api/v1/namespaces/openstack/pods/${PODNAME} | grep -q '"code": *401'; then
        log "Galera resource is being deleted"
        nth=$(( ${PODNAME//*-/} + 1 ))
        while : ; do
            size=$(mysql -uroot -p"${DB_ROOT_PASSWORD}" -sNEe "show status like 'wsrep_cluster_size';" | tail -1)
            if [ ${size:-0} -gt $nth ]; then
                log "Waiting for cluster to scale down"
                sleep 2
            else
                break
            fi
        done
    fi
    # We now to need disconnect the clients so that when the server will
    # initiate its shutdown, they won't receive unexpected WSREP statuses
    # when running SQL queries.
    # Note: It is safe to do it now, as k8s already removed this pod from
    # the service endpoint, so client won't reconnect to it.
    log "Close all active connections to this local galera node"
    # filter out system and localhost connections, only consider clients with a port in the host field
    # from that point, clients will automatically reconnect to another node
    CLIENTS=$(mysql -uroot -p${DB_ROOT_PASSWORD} -nN -e "select id from information_schema.processlist where host like '%:%';")
    echo -n "$CLIENTS" | tr '\n' ',' | xargs -r mysqladmin -uroot -p${DB_ROOT_PASSWORD} kill
    # At this point no clients are connected anymore.
    # We can finish this pre-stop hook and let k8s send the SIGTERM to the
    # mysql server to make it disconnect from the galera cluster and shut down.
    # Note: shutting down mysql here would cause the pod to finish too early,
    # and this pre-stop hook would shows up as 'Failed' in k8s events.
    exit 0
  mysql_wsrep_notify.sh: |
    #!/bin/bash
    # wsrep_notify_cmd hook: keep the k8s Service's active-endpoint selector
    # in sync with the galera cluster membership.
    # NOTE(dciabrin) we might use downward API to populate those in the future
    PODNAME=$HOSTNAME
    SERVICE=${PODNAME/-galera-[0-9]*/}
    # API server config
    APISERVER=https://kubernetes.default.svc
    SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
    NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
    TOKEN=$(cat ${SERVICEACCOUNT}/token)
    CACERT=${SERVICEACCOUNT}/ca.crt
    # OSPRH-17604: use default timeout and retry parameters for fast failover
    # default parameters for curl calls to the API server
    : ${WSREP_NOTIFY_CURL_CONNECT_TIMEOUT:=5}
    : ${WSREP_NOTIFY_CURL_MAX_TIME:=30}
    CURL="curl --connect-timeout ${WSREP_NOTIFY_CURL_CONNECT_TIMEOUT} --max-time ${WSREP_NOTIFY_CURL_MAX_TIME}"
    # defaults parameters for retry on error
    : ${WSREP_NOTIFY_RETRIES:=30}
    : ${WSREP_NOTIFY_RETRY_WAIT:=1}
    ##
    ## Utilities functions
    ##
    ## NOTE: mysql diverts this script's stdout, but stderr is logged to the
    ## configured log-error file (e.g. /var/log/mariadb/mariadb.log)
    function log() {
        echo "$(date +%F_%H_%M_%S) `basename $0` $*" >&2
    }
    function log_error() {
        echo "$(date +%F_%H_%M_%S) `basename $0` ERROR: $*" >&2
    }
    function mysql_get_status {
        local name=$1
        mysql -nNE -uroot -p"${DB_ROOT_PASSWORD}" -e "show status like '${name}';" | tail -1
        local rc=$?
        [ $rc = 0 ] || log_error "could not get value of mysql variable '${name}' (rc=$rc)"
    }
    function mysql_get_members {
        # The up-to-date list of members in this partition are precisely the incoming gcomm
        # addresses, which can be extracted from mysql status.
        # system table mysql.wsrep_cluster_members also exposes that information, but it
        # contains stale information during a state transition (e.g. when a node just
        # disappeared).
        # For accuracy, we only rely on current mysql status, as the data exactly matches
        # the value of argument `--members` passed to this script.
        local addresses
        local rc
        addresses=$(mysql_get_status wsrep_incoming_addresses)
        rc=$?
        if [ $rc = 0 ]; then
            # galera-0.subdomain:3306,galera-1.subdomain:3306,galera-2.subdomain:3306
            echo -n "${addresses}" | tr ',' '\n' | cut -d. -f1
        fi
        return $rc
    }
    # When optional script parameters are not provided, set up the environment
    # variables with the latest WSREP state retrieved from mysql
    function mysql_probe_state {
        [ "$1" = "reprobe" ] && unset UUID PARTITION INDEX SIZE MEMBERS
        : ${UUID=$(mysql_get_status wsrep_gcomm_uuid)}
        : ${PARTITION=$(mysql_get_status wsrep_cluster_status)}
        : ${INDEX=$(mysql_get_status wsrep_local_index)}
        : ${SIZE=$(mysql_get_status wsrep_cluster_size)}
        : ${MEMBERS=$(mysql_get_members)}
        [ -n "${UUID}" -a -n "${PARTITION}" -a -n "${INDEX}" -a -n "${SIZE}" -a -n "${MEMBERS}" ]
    }
    # REST API call to the k8s API server
    function api_server {
        local request=$1
        local service=$2
        # NOTE: a PUT request to the API server is basically a conditional write,
        # it only succeeds if no other write have been done on the CR in the mean time,
        # (i.e. if the timestamp of the JSON that is being sent to the API server matches
        # the timestamp of the service CR in the cluster)
        if [ "$request" = "PUT" ]; then
            request="$request -d @-"
        fi
        local output
        output=$(${CURL} -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" --request $request ${APISERVER}/api/v1/namespaces/${NAMESPACE}/services/${service})
        local rc=$?
        if [ $rc != 0 ]; then
            log_error "call to API server failed for service ${service} (rc=$rc)"
            return 1
        fi
        if echo "${output}" | grep -q '"status": "Failure"'; then
            message=$(echo "${output}" | parse_output '["message"]')
            code=$(echo "${output}" | parse_output '["code"]')
            if [ "${code}" = 401 ]; then
                # Unauthorized means the token is no longer valid as the galera
                # resource is in the process of being deleted.
                return 2
            fi
            log_error "API server returned an error for service ${SERVICE}: ${message} (code=${code})"
            return 1
        fi
        echo "${output}"
        return 0
    }
    # Update the service's active endpoint
    # (parse JSON with python3 as we don't have jq in the container image)
    function service_endpoint {
        local endpoint="$1"
        # note: empty endpoint means "block incoming traffic", so the selector must still
        # be present, otherwise k8s would balance incoming traffic to _any_ available pod.
        python3 -c 'import json,sys;s=json.load(sys.stdin);s["spec"]["selector"]["statefulset.kubernetes.io/pod-name"]="'${endpoint}'";print(json.dumps(s,indent=2))'
        [ $? == 0 ] || log_error "Could not parse json endpoint (rc=$?)"
    }
    # retrieve data from a JSON structure
    # (parse JSON with python3 as we don't have jq in the container image)
    function parse_output {
        local key=$1
        python3 -c 'import json,sys;s=json.load(sys.stdin);print(s'${key}')'
        [ $? == 0 ] || log_error "Could not parse json endpoint (rc=$?)"
    }
    # Generic retry logic for an action function
    function retry {
        local action=$1
        local retries=$WSREP_NOTIFY_RETRIES
        local wait=$WSREP_NOTIFY_RETRY_WAIT
        local rc=1
        $action
        rc=$?
        while [ $rc -ne 0 -a $retries -gt 0 ]; do
            # if API call are unauthorized, the resource is being deleted
            # exit now as there is nothing more to do
            if [ $rc -eq 2 ]; then
                log "galera resource is being deleted, exit now."
                return 0
            fi
            log_error "previous action failed, retrying."
            sleep $wait
            $action
            rc=$?
            retries=$((retries - 1))
            # reprobe mysql state now, as if the cluster state changed since
            # the start of this script, we might not need to retry the action
            mysql_probe_state reprobe
        done
        if [ $rc -ne 0 ]; then
            log_error "Could not run action after ${WSREP_NOTIFY_RETRIES} tries. Stop retrying."
        fi
        return $rc
    }
    ##
    ## Actions
    ##
    ## Change the current Active endpoint in a service
    function reconfigure_service_endpoint {
        if [ $PARTITION != "Primary" -o "$INDEX" != "0" ]; then
            log "Node ${PODNAME} is not the first member of a Primary partion (index: ${INDEX}). Exiting"
            return 0
        fi
        CURRENT_SVC=$(api_server GET "$SERVICE")
        local rc=$?
        [ $rc == 0 ] || return $rc
        CURRENT_ENDPOINT=$(echo "$CURRENT_SVC" | parse_output '["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")')
        [ $? == 0 ] || return 1
        # do not reconfigure endpoint if unecessary, to avoid client disconnections
        if [ -n "${CURRENT_ENDPOINT}" ] && echo "$MEMBERS" | grep -q "^${CURRENT_ENDPOINT}\$"; then
            log "Active endpoint ${CURRENT_ENDPOINT} is still part of the primary partition. Nothing to be done."
            return 0
        fi
        if [ "${CURRENT_ENDPOINT}" == "${PODNAME}" ]; then
            log "Node ${PODNAME} is currently the active endpoint for service ${SERVICE}. Nothing to be done."
            return 0
        fi
        NEW_SVC=$(echo "$CURRENT_SVC" | service_endpoint "$PODNAME")
        [ $? == 0 ] || return 1
        log "Setting ${PODNAME} as the new active endpoint for service ${SERVICE}"
        UPDATE_RESULT=$(echo "$NEW_SVC" | api_server PUT "$SERVICE")
        [ $? == 0 ] || return 1
        return 0
    }
    ## Failover to another node if we are the current Active endpoint
    function failover_service_endpoint {
        if [ $PARTITION != "Primary" ]; then
            log "Node ${PODNAME} is not the Primary partion. Nothing to be done."
            return 0
        fi
        CURRENT_SVC=$(api_server GET "$SERVICE")
        local rc=$?
        [ $rc == 0 ] || return $rc
        CURRENT_ENDPOINT=$(echo "$CURRENT_SVC" | parse_output '["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")')
        [ $? == 0 ] || return 1
        if [ "${CURRENT_ENDPOINT}" != "${PODNAME}" ]; then
            log "Node ${PODNAME} is not the active endpoint. Nothing to be done."
            return 0
        fi
        # select the first available node in the primary partition to be the failover endpoint
        NEW_ENDPOINT=$(echo "$MEMBERS" | grep -v "${PODNAME}" | head -1)
        if [ -z "${NEW_ENDPOINT}" ]; then
            log "No other available node to become the active endpoint."
        fi
        NEW_SVC=$(echo "$CURRENT_SVC" | service_endpoint "$NEW_ENDPOINT")
        [ $? == 0 ] || return 1
        log "Configuring a new active endpoint for service ${SERVICE}: '${CURRENT_ENDPOINT}' -> '${NEW_ENDPOINT}'"
        UPDATE_RESULT=$(echo "$NEW_SVC" | api_server PUT "$SERVICE")
        [ $? == 0 ] || return 1
        return 0
    }
    ## Change the Active endpoint from the service
    function remove_service_endpoint {
        CURRENT_SVC=$(api_server GET "$SERVICE")
        local rc=$?
        [ $rc == 0 ] || return $rc
        CURRENT_ENDPOINT=$(echo "$CURRENT_SVC" | parse_output '["spec"]["selector"].get("statefulset.kubernetes.io/pod-name","")')
        [ $? == 0 ] || return 1
        if [ "${CURRENT_ENDPOINT}" != "${PODNAME}" ]; then
            log "Node ${PODNAME} is currently not the active endpoint for service ${SERVICE}. Nothing to be done."
            return 0
        fi
        NEW_SVC=$(echo "$CURRENT_SVC" | service_endpoint "")
        [ $? == 0 ] || return 1
        log "Removing ${PODNAME} endpoint from service ${SERVICE}"
        UPDATE_RESULT=$(echo "$NEW_SVC" | api_server PUT "$SERVICE")
        [ $? == 0 ] || return 1
        return 0
    }
    ## Main
    log "called with args: $*"
    # Galera always calls script with --status argument
    # All other optional arguments (uuid,partition,index...):
    #   UUID: cluster's current UUID
    #   MEMBERS: galera node connected to the cluster
    #   SIZE: number of nodes in the cluster
    #   INDEX: member index in the cluster
    #   PARTITION: cluster partition we're in (Primary, Non-primary)
    while [ $# -gt 0 ]; do
        case $1 in
            --status)
                STATUS=$2
                shift;;
            --members)
                MEMBERS=$(echo "$2" | tr ',' '\n' | cut -d/ -f2)
                SIZE=$(echo "$MEMBERS" | wc -l)
                shift;;
            --primary)
                [ "$2" = "yes" ] && PARTITION="Primary"
                [ "$2" = "no" ] && PARTITION="Non-primary"
                shift;;
            --index)
                INDEX=$2
                shift;;
            --uuid)
                shift;;
        esac
        shift
    done
    if [ -z "${STATUS}" ]; then
        log_error called without --status STATUS
        exit 1
    fi
    # Condition: ask for a failover. This should be called when mysql is running
    if echo "${STATUS}" | grep -i -q -e 'failover'; then
        # note: make sure that the root credentials are up to date
        # before invoking any mysql command
        source /var/lib/operator-scripts/mysql_root_auth.sh
        mysql_probe_state
        if [ $? != 0 ]; then
            log_error "Could not probe missing mysql information. Aborting"
        fi
        retry "failover_service_endpoint"
    fi
    # Condition: disconnecting -> remove oneself from endpoint if Active
    if echo "${STATUS}" | grep -i -q -e 'disconnecting'; then
        retry "remove_service_endpoint"
        exit $?
    fi
    # Conditions that do not require endpoint updates
    if echo "${STATUS}" | grep -i -q -v -e 'synced'; then
        exit 0
    fi
    # At this point mysql is started, query missing arguments
    # note: make sure that the root credentials are up to date
    # before invoking any mysql command
    source /var/lib/operator-scripts/mysql_root_auth.sh
    mysql_probe_state
    if [ $? != 0 ]; then
        log_error "Could not probe missing mysql information. Aborting"
    fi
    # Condition: first member of the primary partition -> set as Active endpoint
    if [ $PARTITION = "Primary" -a $SIZE -ge 0 -a "$INDEX" = "0" ]; then
        retry "reconfigure_service_endpoint"
        exit $?
    fi
    # NOTE(review): the source dump truncates at the line above; the closing
    # "fi" is the minimal syntactically-required completion — verify any
    # further trailing statements against the canonical upstream script.
fi kind: ConfigMap metadata: creationTimestamp: '2026-02-16T21:35:48Z' managedFields: - apiVersion: v1 fieldsType: FieldsV1 fieldsV1: f:data: .: {} f:detect_gcomm_and_start.sh: {} f:detect_last_commit.sh: {} f:mysql_bootstrap.sh: {} f:mysql_probe.sh: {} f:mysql_root_auth.sh: {} f:mysql_shutdown.sh: {} f:mysql_wsrep_notify.sh: {} f:metadata: f:ownerReferences: .: {} k:{"uid":"cfcc3236-74a2-4810-ad1c-3f5f8a1ab11a"}: {} manager: manager operation: Update time: '2026-02-16T21:35:48Z' name: openstack-cell1-scripts namespace: openstack ownerReferences: - apiVersion: mariadb.openstack.org/v1beta1 blockOwnerDeletion: true controller: true kind: Galera name: openstack-cell1 uid: cfcc3236-74a2-4810-ad1c-3f5f8a1ab11a resourceVersion: '28721' uid: 2867ccd8-f2a8-4269-bf2b-c5f78ad9ab68