#!/bin/sh
#
# db2
#
# Resource agent that manages a DB2 LUW database in Standard role 
# or HADR configuration in promotable configuration.
# Multi partition is supported as well.
#
# Copyright (c) 2011 Holger Teutsch <holger.teutsch@web.de>
#
# This agent incorporates code of a previous release created by
# Alan Robertson and the community.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Use runuser if available for SELinux.
if [ -x "/sbin/runuser" ]; then
    SU="runuser"
else
    SU="su"
fi

# Parameter defaults

OCF_RESKEY_instance_default=""
OCF_RESKEY_skip_basic_sql_health_check_default="false"
OCF_RESKEY_monitor_retries_default="1"
OCF_RESKEY_monitor_retries_sleep_default="1"
OCF_RESKEY_monitor_retry_all_errors_default="false"
OCF_RESKEY_admin_default=""
OCF_RESKEY_dbpartitionnum_default="0"

: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
: ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}}
: ${OCF_RESKEY_monitor_retries_sleep=${OCF_RESKEY_monitor_retries_sleep_default}}
: ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}}
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}

POSIX_UNICODE_LOCALE="C.UTF-8"
#######################################################################


db2_usage() {
    # Print the one-line summary of the actions this agent understands.
    printf '%s\n' "db2 start|stop|monitor|promote|demote|validate-all|meta-data"
}

db2_meta_data() {
# Print the OCF meta-data XML for this agent on stdout.
# The here-document is unquoted on purpose: the *_default variables are
# expanded so the advertised defaults match the ones applied at start-up.
# monitor_retry_all_errors is declared boolean because it is evaluated with
# ocf_is_true, like skip_basic_sql_health_check.
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="db2" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Resource Agent that manages IBM DB2 LUW databases in Standard role as primitive or in HADR roles in promotable configuration. Multiple partitions are supported.

Standard mode:

An instance including all or selected databases is made highly available.
Configure each partition as a separate primitive resource.

HADR mode:

A single database in HADR configuration is made highly available by automating takeover operations.
Configure a promotable resource with notifications enabled and an
additional monitoring operation with role "Promoted".

In case of HADR be very deliberate in specifying intervals/timeouts. The detection of a failure including promote must complete within HADR_PEER_WINDOW.

In addition to honoring requirements for crash recovery etc. for your specific database use the following relations as guidance:

"monitor interval" &lt; HADR_PEER_WINDOW - (appr 30 sec)

"promote timeout" &lt; HADR_PEER_WINDOW + (appr 20 sec)

For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent)
</longdesc>
<shortdesc lang="en">Resource Agent that manages IBM DB2 LUW databases in Standard role as primitive or in HADR roles as promotable configuration. Multiple partitions are supported.</shortdesc>

<parameters>
<parameter name="instance" unique="1" required="1">
<longdesc lang="en">
The instance of the database(s).
</longdesc>
<shortdesc lang="en">instance</shortdesc>
<content type="string" default="${OCF_RESKEY_instance_default}" />
</parameter>
<parameter name="dblist" unique="0" required="0">
<longdesc lang="en">
List of databases to be managed, e.g. "db1 db2".
Defaults to all databases in the instance. Specify one db for HADR mode.
</longdesc>
<shortdesc lang="en">List of databases to be managed</shortdesc>
<content type="string"/>
</parameter>
<parameter name="skip_basic_sql_health_check" unique="0" required="0">
<longdesc lang="en">
Skip basic health check SQL query.

Only set to "true" when the "monitor_retries" and "monitor_retry_all_errors" parameters aren't
enough to avoid issues under high load.
</longdesc>
<shortdesc lang="en">Skip basic health check SQL query</shortdesc>
<content type="boolean" default="${OCF_RESKEY_skip_basic_sql_health_check_default}" />
</parameter>
<parameter name="monitor_retries" unique="0" required="0">
<longdesc lang="en">
Monitor retries before failing.
</longdesc>
<shortdesc lang="en">Monitor retries</shortdesc>
<content type="string" default="${OCF_RESKEY_monitor_retries_default}" />
</parameter>
<parameter name="monitor_retries_sleep" unique="0" required="0">
<longdesc lang="en">
Monitor sleep between tries.
</longdesc>
<shortdesc lang="en">Monitor sleep</shortdesc>
<content type="string" default="${OCF_RESKEY_monitor_retries_sleep_default}" />
</parameter>
<parameter name="monitor_retry_all_errors" unique="0" required="0">
<longdesc lang="en">
Set to true to retry monitor-action for all errors instead of the default "db2pd" race conditions.
</longdesc>
<shortdesc lang="en">Retry monitor for all errors</shortdesc>
<content type="boolean" default="${OCF_RESKEY_monitor_retry_all_errors_default}" />
</parameter>
<parameter name="admin" unique="0" required="0">
<longdesc lang="en">
DEPRECATED: The admin user of the instance.
</longdesc>
<shortdesc lang="en">DEPRECATED: admin</shortdesc>
<content type="string" default="${OCF_RESKEY_admin_default}" />
</parameter>
<parameter name="dbpartitionnum" unique="0" required="0">
<longdesc lang="en">
The number of the partition (DBPARTITIONNUM) to be managed.
</longdesc>
<shortdesc lang="en">database partition number (DBPARTITIONNUM)</shortdesc>
<content type="string" default="${OCF_RESKEY_dbpartitionnum_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="120s"/>
<action name="stop" timeout="120s"/>
<action name="promote" timeout="120s"/>
<action name="demote" timeout="120s"/>
<action name="monitor" depth="0" timeout="60s" interval="20s"/>
<action name="monitor" depth="0" timeout="60s" role="Promoted" interval="22s"/>
<action name="validate-all" timeout="5s"/>
<action name="meta-data" timeout="5s"/>
</actions>
</resource-agent>
END
}

#
# validate
# .. and set global variables
#
# exit on error
#
db2_validate() {
    # Validate the configuration and derive the globals used by the other
    # actions: instance, db2node, db2sql, db2profile, db2bin, STATE_FILE and
    # (except for "stop") dblist.
    #
    # Returns OCF_ERR_CONFIGURED when the instance parameter is empty and
    # OCF_SUCCESS otherwise; every other failure terminates the agent via
    # exit (probes exit with OCF_NOT_RUNNING for a missing installation).
    #
    # db2sql is intentionally NOT local: db2_start and db2_stop_bg test for
    # $db2sql/db2nodes.cfg after this function has returned.
    local db2home db2instance

    # db2 uses korn shell
    check_binary "ksh"

    # check required instance vars
    if [ -z "$OCF_RESKEY_instance" ]
    then
        ocf_log err "DB2 required parameter instance is not set!"
        return $OCF_ERR_CONFIGURED
    fi

    instance=$OCF_RESKEY_instance
    if [ -n "$OCF_RESKEY_admin" ]
    then
        ocf_log warn "DB2 deprecated parameter admin is set, using $OCF_RESKEY_admin as instance."
        instance=$OCF_RESKEY_admin
    fi

    db2node=${OCF_RESKEY_dbpartitionnum:-0}

    # let a shell resolve the instance user's home directory (~user)
    db2home=$(sh -c "echo ~$instance")
    db2sql=$db2home/sqllib
    db2profile=$db2sql/db2profile
    db2bin=$db2sql/bin

    STATE_FILE=${HA_RSCTMP}/db2-${OCF_RESOURCE_INSTANCE}.state

    # Let's make sure a few important things are there...
    if ! [ -d "$db2sql" -a  -d "$db2bin" -a -f "$db2profile" -a \
           -x "$db2profile" -a -x "$db2bin/db2" ]
    then
        ocf_is_probe && exit $OCF_NOT_RUNNING
        ocf_log err "DB2 required directories and/or files not found"
        exit $OCF_ERR_INSTALLED
    fi

    # the profile must define the same instance we were configured with
    db2instance=$(runasdb2 'echo $DB2INSTANCE')
    if [ "$db2instance" != "$instance" ]
    then
        ocf_is_probe && exit $OCF_NOT_RUNNING
        ocf_log err "DB2 parameter instance \"$instance\" != DB2INSTANCE \"$db2instance\""
        exit $OCF_ERR_CONFIGURED
    fi

    # enough checking for stop to succeed
    [ $__OCF_ACTION = stop ] && return $OCF_SUCCESS

    dblist=$OCF_RESKEY_dblist
    if [ -n "$dblist" ]
    then
        # support , as separator as well
        dblist=$(echo "$dblist" | sed -e 's/[,]/ /g')
    else
        if ! dblist=$(db2_dblist)
        then
            ocf_log err "DB2 $instance($db2node): cannot retrieve db directory"
            exit $OCF_ERR_INSTALLED
        fi
    fi

    # check requirements for the HADR case
    if ocf_is_ms
    then
        # count the entries of dblist via the positional parameters
        set -- $dblist
        if [ $# != 1 ]
        then
            ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have exactly one name in dblist"
            exit $OCF_ERR_CONFIGURED
        fi

        if [ $db2node != 0 ]
        then
            ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have dbpartitionnum=0"
            exit $OCF_ERR_CONFIGURED
        fi
    fi

    return $OCF_SUCCESS
}

master_score()
{
    # Forward all arguments unchanged to crm_master to set or delete this
    # node's promotion score. Silently succeeds when crm_master is not
    # installed.
    have_binary "crm_master" || return 0

    # "$@" keeps every argument as its own word (no re-splitting/globbing)
    crm_master "$@"
}

#
# Run the given command as db2 instance user
#
runasdb2() {
    # Join all arguments into one command line, prefix it with sourcing the
    # instance profile and hand it to $SU so it runs as the instance user.
    local cmdline
    cmdline="$*"
    $SU $instance -c ". $db2profile; $cmdline"
}

#
# Run the given command as db2 instance user using $SU
# We run this function as opposed to runasdb2 whenever we have to issue commands
# that leave processes running on the system, such as db2start
# We do not want these processes to hog the resources as they were run with elevated privileges
#
runasdb2_session() {
   # Run the given command line as the instance user inside ksh, after
   # sourcing the instance profile.
   # Override db2profile with unicode locale is required to maintain compatibility with unicode CODEPAGE
   # The inner double quotes are escaped so they stay part of the single -c
   # argument and reach ksh; unescaped they would end the outer string and
   # leave $POSIX_UNICODE_LOCALE unquoted.
   $SU "$instance" -c "ksh -c '. $db2profile; export LC_ALL=\"$POSIX_UNICODE_LOCALE\"; export LANG=\"$POSIX_UNICODE_LOCALE\"; $*'"
}

#
# Run a command as the DB2 admin, and log the output
#
logasdb2() {
    local output rc

    output=$(runasdb2 $*)
    rc=$?
    if [ $rc -eq 0 ]
    then
        ocf_log info "$output"
    else
        ocf_log err "$output"
    fi
    return $rc
}


#
# unfortunately a first connect after a crash may need several minutes
# for some internal cleanup stuff in DB2.
# We run a connect in background so other connects (i.e. monitoring!) may proceed.
#
db2_run_connect() {
    # Connect to the database named by $1 and terminate again, logging the
    # result through logasdb2.
    local target
    target=$1

    logasdb2 "db2 connect to $target; db2 terminate"
}

#
# get some data from the database config
# sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW
#
db2_get_cfg() {
    # Read the database configuration of $1 and publish selected values as
    # globals: HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG.
    # Returns OCF_ERR_GENERIC when the configuration cannot be read or when
    # HADR_ROLE/HADR_TIMEOUT are missing, OCF_SUCCESS otherwise.
    local db=$1

    local output hadr_vars

    output=$(runasdb2 db2 get db cfg for $db)
    [ $? != 0 ] && return $OCF_ERR_GENERIC

    # human readable summary, only used in the error message below
    hadr_vars=$(echo "$output" |
        awk '/HADR database role/ {printf "HADR_ROLE=%s; ", $NF;}
            /HADR_TIMEOUT/ {printf "HADR_TIMEOUT=%s; ", $NF;}
            /First active log file/ {printf "FIRST_ACTIVE_LOG=%s\n", $NF;}
            /HADR_PEER_WINDOW/ {printf "HADR_PEER_WINDOW=%s\n", $NF;}')

    # sets HADR_ROLE HADR_TIMEOUT FIRST_ACTIVE_LOG HADR_PEER_WINDOW
    # (the value is the last field of the matching configuration line)
    HADR_ROLE=$(echo "$output" | awk '/HADR database role/ {print $NF;}')
    HADR_TIMEOUT=$(echo "$output" | awk '/HADR_TIMEOUT/ {print $NF;}')
    FIRST_ACTIVE_LOG=$(echo "$output" | awk '/First active log file/ {print $NF;}')
    HADR_PEER_WINDOW=$(echo "$output" | awk '/HADR_PEER_WINDOW/ {print $NF;}')

    # HADR_PEER_WINDOW comes with V9 and is checked later
    if [ -z "$HADR_ROLE" -o -z "$HADR_TIMEOUT" ]
    then
        # "err" is the level name used everywhere else in this agent
        ocf_log err "DB2 cfg values invalid for $instance($db2node)/$db: $hadr_vars"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

#
# return the list of databases in the instance
#
db2_dblist() {
    # Print the database names found in the instance's database directory,
    # one per line; returns OCF_ERR_GENERIC if the directory cannot be listed.
    local listing

    if ! listing=$(runasdb2 db2 list database directory); then
        return $OCF_ERR_GENERIC
    fi

    # keep the "Database name ... = X" lines and strip everything up to "= "
    echo "$listing" |
        grep -i 'Database name.*=' |
        sed 's%.*= *%%'
}

#
# Delayed check of the compatibility of DB2 instance and pacemaker
# config.
# Logically this belongs to validate but certain parameters can only
# be retrieved once the instance is started.
#
db2_check_config_compatibility() {
    # Cross-check the HADR role of database $1 (global HADR_ROLE) against
    # the result of ocf_is_ms. Exits with OCF_ERR_INSTALLED for a STANDARD
    # database when ocf_is_ms succeeds, and for an HADR database without
    # HADR_PEER_WINDOW; only logs an error for an HADR database when
    # ocf_is_ms returns 1.
    local db=$1
    local ms_rc

    ocf_is_ms
    ms_rc=$?

    if [ "$ms_rc" = 0 ]; then
        if [ "$HADR_ROLE" = STANDARD ]; then
            ocf_log err "DB2 database $instance/$db is not in a HADR configuration but I am a M/S resource"
            exit $OCF_ERR_INSTALLED
        elif [ -z "$HADR_PEER_WINDOW" ]; then
            ocf_log err "DB2 database $instance: release to old, need HADR_PEER_WINDOW (>=V9)"
            exit $OCF_ERR_INSTALLED
        fi
    elif [ "$ms_rc" = 1 ]; then
        # STANDARD as a plain primitive is the OK case: nothing to do
        if [ "$HADR_ROLE" != STANDARD ]; then
            ocf_log err "DB2 database $instance/$db is in a HADR configuration but I must be a M/S resource"
        fi
    fi
}

#
# Start HADR as standby.
#
# Parameters
#     1 - Database name
#     2 - Calling function
#     3 - Calling function's line number
#
# Return codes:
#     0 - Start as standby successful
#     1 - Start as standby failed
#
reintegrateAsStandby() {
   # Start HADR as standby for database $1; $2 and $3 name the calling
   # function and its line number and are only used for logging.
   # SQL1777N (HADR already started) counts as success.
   # The reintegration CIB attribute of the local node is reset to 0 in
   # every case, also when the start failed.
   # Returns 0 on success, 1 on failure.
   #
   # All working variables are local so the caller's db/output/rc are not
   # overwritten.
   local db reint_attr output rc

   db=$1
   reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
   ocf_log info "$__OCF_ACTION: $LINENO: reintegrateAsStandby called by $2 at $3. Attempting to reintegrate $db as standby."
   if output=$(runasdb2_session "db2 start hadr on db $db as standby"); then
      rc=0
      ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
   else
      case $output in
      SQL1777N*)
         # SQL1777N: HADR is already started in given state.
         ocf_log info "$__OCF_ACTION: $LINENO: $output"
         rc=0
         ;;

      *)
         rc=1
         ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
         ;;
      esac
   fi
   crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
   return $rc
}

#
# Start instance and DB.
# Standard mode is through "db2 activate" in order to start in previous
# mode (Standby/Primary).
# If the database is a primary AND we can determine that the running master
# has a higher "first active log" we conclude that we come up after a crash
# and the previous Standby is now Primary.
# The db is then started as Standby.
#
# Other cases: danger of split brain, log error and do nothing.
#
db2_start() {
    # Start the instance partition and, on partition 0, activate every
    # database in $dblist.
    # Returns OCF_SUCCESS when everything came up and OCF_ERR_GENERIC (or the
    # status of db2_get_cfg) otherwise. db2_check_config_compatibility may
    # terminate the agent via exit.
    # Side effects: writes SLAVE to $STATE_FILE and may reset the
    # reintegration CIB attributes of the local node.
    # NOTE: reint_attr, cib_value, rc and standby_reintegration are not
    # declared local and therefore are script globals.
    local output start_cmd db
    local start_opts="dbpartitionnum $db2node"

    # If we detect that db partitions are not in use, and no
    # partition is explicitly specified, activate without
    # partition information. This allows db2 instances without
    # partition support to be managed.
    if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
        start_opts=""
    fi

    if output=$(runasdb2 db2start $start_opts)
    then
        ocf_log info "DB2 instance $instance($db2node) started: $output"
    else
        case $output in
            *SQL1026N*)
            # treated as "already running", not as a failure
            ocf_log info "DB2 instance $instance($db2node) already running: $output"
            ;;

            *)
            ocf_log err "$output"
            return $OCF_ERR_GENERIC
        esac
    fi

    if ! db2_instance_status
    then
        ocf_log err "DB2 instance $instance($db2node) is not active!"
        return $OCF_ERR_GENERIC
    fi

    [ $db2node = 0 ] || return $OCF_SUCCESS
    # activate DB only on node 0

    for db in $dblist
    do
        # name of the CIB node attribute that flags "reintegrate as standby"
        reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"

        # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG
        db2_get_cfg $db || return $?

        # Better late than never: can only check this when the instance is already up
        db2_check_config_compatibility $db

        start_cmd="db2 activate db $db"

        if [ $HADR_ROLE = PRIMARY ]
        then
            # the attribute value is whatever follows " value=" in the
            # crm_attribute query output
            cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
            ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value'"
            if [ "$cib_value" = "1" ]; then
                ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary"
                start_cmd="db2 start hadr on db $db as standby"
                HADR_ROLE=STANDBY
                standby_reintegration=1
            fi
        fi

        if output=$(runasdb2 $start_cmd)
        then
            ocf_log info "DB2 database $instance($db2node)/$db started/activated"
            # first connect runs in the background so start does not wait for it
            [ $HADR_ROLE != STANDBY ] && db2_run_connect $db &
        else
            case $output in
            SQL1490W* | SQL1494W* | SQL1497W* | SQL1777N*)
                # SQL1490W  Activate database is successful, however, the database has already been activated on one or more nodes.
                # SQL1494W  Activate database is successful, however, there is already a connection to the database.
                # SQL1497W  Activate/Deactivate database was successful, however, an error occurred on some nodes.
                # SQL1777N  HADR is already started.

                ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is already activated: $output"
                ;;

            SQL1768N*"Reason code = \"7\""*)
                rc="$OCF_ERR_GENERIC"

                ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is a Primary and the Standby is down"
                ocf_log err "Possible split brain! Manual intervention required."
                ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\""
                ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\".  db2_start() exit with rc=$rc."

                # let pacemaker give it another try and we will succeed then
                return "$rc"
                ;;

            SQL1776N*"Reason code = \"6\""*)
                # SQL1776N  The command cannot be issued on an HADR database.
                # Reason code 6:
                #  This database is an old primary database. It cannot be started
                #  because the standby has become the new primary through forced
                #  takeover.

                rc="$OCF_ERR_GENERIC"
                ocf_log err "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db didn't start: $output, return with rc=$rc"
                ocf_log err "$__OCF_ACTION: $LINENO: This database is an old primary database. Trying start again as standby"

                start_cmd="db2 start hadr on db $db as standby"
                if output=$(runasdb2_session "$start_cmd"); then
                    rc="$OCF_SUCCESS"
                    ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
                else
                    case $output in
                    SQL1777N*)
                        # SQL1777N: HADR is already started.
                        ocf_log info "$__OCF_ACTION: $LINENO: $output"
                        rc="$OCF_SUCCESS"
                        ;;

                    *)
                        rc="$OCF_ERR_GENERIC"
                        ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
                        ;;
                    esac
                fi

                # NOTE: this returns from db2_start right away, i.e. before
                # any remaining database is handled and before the state
                # file is written, even when the retry succeeded.
                return "$rc"
                ;;

            *)
                rc="$OCF_ERR_GENERIC"
                ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database didn't start: $output, db2_start() exit with rc=$rc."
                return "$rc"
                ;;
            esac
        fi
    done

    # come here with success
    # Even if we are a db2 Primary pacemaker requires start to end up in slave mode
    echo SLAVE > $STATE_FILE

    # Unset primary failover attribute as host was successfully reintegrated as standby
    if [ "$standby_reintegration" = "1" ]; then
        for db in $dblist; do
            reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
            crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
        done
    fi

    return $OCF_SUCCESS
}

#
# helper function to be spawned
# so we can detect a hang of the db2stop command
#
db2_stop_bg() {
    # Issue "db2stop force" for this partition and translate the outcome:
    # OCF_SUCCESS when the instance stopped or was not started (SQL1032N),
    # OCF_ERR_GENERIC for any other failure.
    local result=$OCF_SUCCESS
    local msg
    local partition_opts=""

    # Pass the partition number unless none was configured and the
    # instance has no db2nodes.cfg.
    if [ -n "$OCF_RESKEY_dbpartitionnum" ] || [ -e "$db2sql/db2nodes.cfg" ]; then
        partition_opts="dbpartitionnum $db2node"
    fi

    if msg=$(runasdb2 db2stop force $partition_opts); then
        ocf_log info "DB2 instance $instance($db2node) stopped: $msg"
        return $result
    fi

    case $msg in
        *SQL1032N*)
            #SQL1032N  No start database manager command was issued
            ocf_log info "$msg"
            ;;

        *)
            ocf_log err "DB2 instance $instance($db2node) stop failed: $msg"
            result=$OCF_ERR_GENERIC
            ;;
    esac

    return $result
}

#
# Stop the given db2 database instance
#
db2_stop() {
    # Stop the instance partition: try db2stop (in the background, bounded
    # by a grace period) and fall back to db2nkill/db2_kill when that fails,
    # hangs or leaves processes behind.
    # Every return path yields OCF_SUCCESS; if the processes never go away
    # the final wait loop does not terminate on its own.
    # NOTE: stoprc is not declared local and therefore is a script global.
    local stop_timeout grace_timeout stop_bg_pid i must_kill

    # remove master score
    master_score -D -l reboot

    # be very early here in order to avoid stale data
    rm -f $STATE_FILE

    db2_instance_status
    if [ $? -eq $OCF_NOT_RUNNING ]; then
        ocf_log info "DB2 instance $instance already stopped"
        return $OCF_SUCCESS
    fi

    stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000}

    # grace_time is 4/5 (unit is ms)
    # i.e. timeout[ms] / 1000 * 4/5 = timeout / 1250, giving seconds
    grace_timeout=$((stop_timeout/1250))

    # start db2stop in background as this may hang
    db2_stop_bg &
    stop_bg_pid=$!

    # wait for grace_timeout
    # (polling once per second; leaves early as soon as the job is gone)
    i=0
    while [ $i -lt $grace_timeout ]
    do
        kill -0 $stop_bg_pid 2>/dev/null || break;
        sleep 1
        i=$((i+1))
    done

    # collect exit status but don't hang
    if kill -0 $stop_bg_pid 2>/dev/null
    then
        # still running after the grace period: count it as failed
        stoprc=1
        kill -9 $stop_bg_pid 2>/dev/null
    else
        wait $stop_bg_pid
        stoprc=$?
    fi

    must_kill=0

    if [ $stoprc -ne 0 ]
    then
        ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill"
        must_kill=1
    elif ! db2_instance_dead
    then
        ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill"
        must_kill=1
    fi

    if [ $must_kill -eq 1 ]
    then
        # db2nkill kills *all* partitions on the node
        # (db2_kill is the fallback when db2nkill is not executable)
        if [ -x $db2bin/db2nkill ]
        then
            logasdb2 $db2bin/db2nkill $db2node
        elif [ -x $db2bin/db2_kill ]
        then
            logasdb2 $db2bin/db2_kill
        fi

        # loop forever (or lrmd kills us due to timeout) until the
        # instance is dead
        while ! db2_instance_dead
        do
            ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit"
            sleep 1
        done

        ocf_log info "DB2 instance $instance($db2node) is now dead"
    fi

    return $OCF_SUCCESS
}

#
# check whether `enough´ processes for a healthy instance are up
# 
db2_instance_status() {
    # Count the db2* processes db2nps reports for this partition and map the
    # count to a status: none -> OCF_NOT_RUNNING, 1..3 -> OCF_ERR_GENERIC,
    # 4 or more -> OCF_SUCCESS.
    local nprocs

    nprocs=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep -c ' db2[^ ]')

    if [ $nprocs -ge 4 ]; then
        return $OCF_SUCCESS
    fi
    if [ $nprocs -ge 1 ]; then
        return $OCF_ERR_GENERIC
    fi
    return $OCF_NOT_RUNNING
}

#
# is the given db2 instance dead?
# 
db2_instance_dead() {
    # Succeeds (status 0) only when db2nps reports no db2* process at all
    # for this partition.
    local remaining

    remaining=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep -c ' db2[^ ]')
    [ $remaining -eq 0 ]
}

#
# return the status of the db as "Role/Status"
# e.g. Primary/Peer, Standby/RemoteCatchupPending
#
# If not in HADR configuration return "Standard/Standalone"
#
db2_hadr_status() {
    # Print the HADR status of database $1 as "Role/State[/ConnectStatus]",
    # "Standard/Standalone" when HADR is not active, or "Down/Off" when
    # db2pd itself failed.
    # Returns 0 on success, 1 together with "Down/Off", and 255 when the
    # db2pd output contains one of the messages the awk program below maps
    # to 255 (the monitor retry logic keys on that value).
    local db=$1
    local output pd_rc parsed awk_rc

    output=$(runasdb2 db2pd -hadr -db $db)
    # remember the db2pd status before logging: ocf_log overwrites $?
    pd_rc=$?
    ocf_log debug "db2_hadr_status: $output"

    parsed=$(echo "$output" |
    awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"}
         /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; }
         /^HADR is not active/ {print "Standard/Standalone"; exit; }
         /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }
         /^Option -hadr requires -db <database> or -alldbs option and active database./ { exit 255 }
         /^Another possibility of this failure is the Virtual Address Space Randomization is currently enabled on this system./ { exit 255 }
         /^Changing data structure forced command termination./ { exit 255 }')
    awk_rc=$?

    # A recognised transient message keeps its 255 even if db2pd failed;
    # any other db2pd failure is reported as Down/Off.
    if [ $awk_rc -ne 255 ] && [ $pd_rc -ne 0 ]
    then
        echo "Down/Off"
        return 1
    fi

    [ -n "$parsed" ] && echo "$parsed"
    return $awk_rc
}

db2_monitor_retry() {
    # Run db2_monitor up to monitor_retries+1 times.
    # A result of OCF_SUCCESS, OCF_RUNNING_MASTER or OCF_NOT_RUNNING is
    # final. 255 is always retried; other failures are retried only when
    # monitor_retry_all_errors is true. A final 255 is reported as
    # OCF_ERR_GENERIC, and any final result other than success/master
    # removes the master score.
    local tries=$(($OCF_RESKEY_monitor_retries + 1))
    local attempt=1

    while [ $attempt -le $tries ]; do
        ocf_log debug "monitor try $attempt of $tries"
        db2_monitor
        rc=$?

        case $rc in
            $OCF_SUCCESS|$OCF_RUNNING_MASTER|$OCF_NOT_RUNNING)
                break
                ;;
        esac

        ocf_log warn "Monitor failed with rc $rc."
        if [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors"; then
            break
        fi

        # no pause after the last attempt
        if [ $attempt -lt $tries ]; then
            sleep $OCF_RESKEY_monitor_retries_sleep
        fi
        attempt=$((attempt + 1))
    done

    if [ $rc -eq 255 ]; then
        rc=$OCF_ERR_GENERIC
    fi

    if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ]; then
        # instance is dead remove master score
        master_score -D -l reboot
    fi

    return $rc
}

#
# Monitor the db
# And as side effect set crm_master
#
db2_monitor() {
    # Check the instance and, on partition 0, every database in $dblist.
    # Returns the status of db2_instance_status when that is not
    # OCF_SUCCESS; 255 (monitor action only) when db2_hadr_status reported
    # 255; OCF_ERR_GENERIC or OCF_NOT_RUNNING for unhealthy databases;
    # OCF_RUNNING_MASTER when $STATE_FILE contains MASTER; OCF_SUCCESS
    # otherwise.
    # Side effects: adjusts the master score and maintains the
    # reintegration CIB attributes.
    # NOTE: reint_attr and cib_value are not declared local and therefore
    # are script globals.
    local CMD output hadr db
    local rc

    db2_instance_status
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ]; then
        return $rc
    fi

    [ $db2node = 0 ] || return 0
    # monitoring only for partition 0

    for db in $dblist
    do
        reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"

        #Check for the reintegration file, then set the flag if it exists and delete the file
        if [ -e "/tmp/$reint_attr" ] && [ -n "$remote_host" ]; then
            #The file exist, try to set the reintegration attribute
            crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
            # read the attribute back to confirm the update took effect
            cib_value=$(crm_attribute -n "$reint_attr" -N "$remote_host" -G | awk -v FS=' value=' '{print $2}')

            if [ "$cib_value" = "1" ]; then
                ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value', reintegration flag file will now be deleted."
                rm -f "/tmp/$reint_attr"
            else
                ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The reintegration flag file exists, but its attribute failed to set."
            fi
        fi

        hadr=$(db2_hadr_status $db)
        rc=$?
        ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
        if [ "$rc" -eq 255 ]; then
            # 255 is only passed on for the monitor action (db2_monitor_retry
            # retries on it); other callers get a plain generic error
            if [ "$__OCF_ACTION" = "monitor" ]; then
                return $rc
            else
                return $OCF_ERR_GENERIC
            fi
        elif [ "$rc" -ne 0 ]; then
            return $OCF_ERR_GENERIC
        fi

        # set master preference accordingly
        case "$hadr" in
            PRIMARY/*|Primary/*|Standard/*)
            if ! ocf_is_true "$OCF_RESKEY_skip_basic_sql_health_check"; then
                # perform  a basic health check
                # (the \$ and \* are escaped so they are expanded by the
                # shell that runs the command as the instance user)
                CMD="if db2 connect to $db;
                then
                    db2 select \* from sysibm.sysversions ; rc=\$?;
                    db2 terminate;
                else
                    rc=\$?;
                fi;
                exit \$rc"

                if ! output=$(runasdb2 $CMD)
                then
                    case "$output" in
                        SQL1776N*)
                        # can't connect/select on standby, may be spurious turing takeover
                        ;;

                        *)
                        ocf_log err "DB2 database $instance($db2node)/$db is not working"
                        ocf_log err "DB2 message: $output"

                        # dead primary, remove master score
                        master_score -D -l reboot
                        return $OCF_ERR_GENERIC
                    esac
                fi
            fi

            ocf_log debug "DB2 database $instance($db2node)/$db appears to be working"
            ocf_is_ms && master_score -v 10000 -l reboot
            ;;

            STANDBY/*PEER/*|Standby/*Peer)
            # If db is in standby peer, then it has already reintegrated.
            # If the reintegrate flag is still set, remove it
            cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
            if [ "$cib_value" = "1" ]; then
               ocf_log info "$__OCF_ACTION: $LINENO: Reintegrate flag detected for $db, but it has already reintegrated as standby. Removing reintegration flag."
               crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
            fi

            master_score -v 8000 -l reboot
            ;;

            STANDBY/*|Standby/*)
            ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted"
            master_score -D -l reboot
            ;;

            Down/Off)
            # NOTE(review): db2_hadr_status returns non-zero whenever it
            # prints Down/Off and a non-zero status already returned above,
            # so this branch looks unreachable - confirm.
            # If db is a deactivated primary and it has a reintegration flag, then reintegrate as standby.
            cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
            if [ "$cib_value" = "1" ]; then
                output=$(runasdb2 "db2 get db cfg for $db" | grep 'HADR database role' | awk '{print $5}')
                if [ "PRIMARY" = "$output" ]; then
                   ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Database is deactivated with Primary role and the reintegration flag is set. Role: $output, Reintegration flag: $reint_attr = $cib_value"
                   # Reintegrate as the standby database.
                   if reintegrateAsStandby "$db" 'db2_monitor' $LINENO; then
                      ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration succeeded."
                      # Setting slave state here will cause rc to be OCF_SUCCESS below.
                      ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Echoing SLAVE into $STATE_FILE"
                      echo SLAVE >"$STATE_FILE"
                      # Update master score to reflect standby state.
                      master_score -v 8000 -l reboot
                   else
                      ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration failed."
                      return "$OCF_ERR_GENERIC"
                   fi
                fi
            else
                rc="$OCF_NOT_RUNNING"
                ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database has HADR status $hadr."
                ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: db2_monitor() exit with rc=$rc."
                return "$rc"
            fi
            ;;

            *)
            # any status not listed above is treated as a failure
            return $OCF_ERR_GENERIC
        esac
    done

    # everything OK, return if running as slave
    grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS

    return $OCF_RUNNING_MASTER
}

#
# Promote db to Primary
#
db2_promote() {
    # Promote the (single) database in $dblist to HADR primary.
    # Returns OCF_SUCCESS when the database is (or already was) primary and
    # writes MASTER to $STATE_FILE in that case; returns OCF_ERR_GENERIC
    # when the status cannot be determined, is not promotable, or the
    # takeover failed.

    # validate ensured that dblist contains only one entry
    local db=$dblist
    local i hadr output force

    # we run this twice as after a crash of the other node
    # within HADR_TIMEOUT the status may be still reported as Peer
    # although a connection no longer exists

    for i in 1 2
    do
        # Decide the force option from the status seen in THIS attempt only;
        # a value left over from the first attempt must not leak into the
        # second takeover.
        force=""

        hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
        ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be promoted"

        case "$hadr" in
            Standard/Standalone)
            # this case only to keep ocf-tester happy
            return $OCF_SUCCESS
            ;;

            PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
            # nothing to do, only update pacemaker's view
            echo MASTER > $STATE_FILE
            return $OCF_SUCCESS
            ;;

            STANDBY/PEER/CONNECTED|Standby/Peer)
            # must take over
            ;;

            STANDBY/*PEER/DISCONNECTED|Standby/DisconnectedPeer)
            # must take over by force peer window only
            force="by force peer window only"
            ;;

            # must take over by force
            STANDBY/REMOTE_CATCHUP_PENDING/DISCONNECTED)
            force="by force"
            ;;

            *)
            return $OCF_ERR_GENERIC
        esac

        if output=$(runasdb2 db2 takeover hadr on db $db $force)
        then
            # update pacemaker's view
            echo MASTER > $STATE_FILE

            return $OCF_SUCCESS
        fi

        case "$output" in
            SQL1770N*"Reason code = \"7\""*)
            # expected, HADR_TIMEOUT is now expired
            # go for the second try
            continue
            ;;

            *)
            ocf_log err "DB2 database $instance($db2node)/$db promote failed: $output"
            return $OCF_ERR_GENERIC
        esac
    done

    # both attempts ended with SQL1770N reason code 7
    return $OCF_ERR_GENERIC
}

#
# Demote db to standby
#
db2_demote() {
    # Record the SLAVE role in $STATE_FILE, log the current HADR status and
    # hand back whatever db2_monitor returns. Returns OCF_ERR_GENERIC when
    # the HADR status cannot be determined.

    # validate ensured that dblist contains only one entry
    local db=$dblist
    local current

    # house keeping, set pacemaker's view to slave
    echo SLAVE > $STATE_FILE

    if ! current=$(db2_hadr_status $dblist); then
        return $OCF_ERR_GENERIC
    fi
    ocf_log info "DB2 database $instance($db2node)/$db has HADR status $current and will be demoted"

    db2_monitor
}

########
# Main #
########
# meta-data and usage need neither cluster tools nor a valid configuration,
# so they are answered before anything else is evaluated.
case "$__OCF_ACTION" in
    meta-data)
    db2_meta_data
    exit $OCF_SUCCESS
    ;;

    usage)
    db2_usage
    exit $OCF_SUCCESS
    ;;
esac

# Globals used to build the names of the reintegration CIB attributes
# (db2hadr-<inst1>_<inst2>_<db>_reint) and to address the local/remote node.
local_host=$(ocf_local_nodename)
# first and second comma-separated field of the instance parameter
inst1=$(echo "$OCF_RESKEY_instance" | cut -d"," -f1)
inst2=$(echo "$OCF_RESKEY_instance" | cut -d"," -f2)
# second column of the first line of the sorted "crm_node -l" output
# (presumably the node name - confirm against the crm_node output format)
host1=$(crm_node -l | sort | awk '{print $2;}' | sed -n 1p)

# remote_host is the other of the first two listed nodes
if [ "$host1" = "$local_host" ]; then
   remote_host=$(crm_node -l | sort | awk '{print $2;}' | sed -n 2p)
else
   remote_host="$host1"
fi

# NOTE: db2_validate exits on most errors by itself; its return value is
# only consulted for validate-all, every other action runs regardless.
db2_validate; validate_rc=$?

case "$__OCF_ACTION" in
    start)
    # a successful start is followed by a monitor run whose status becomes
    # the exit status
    db2_start || exit $?
    db2_monitor
    ;;

    stop)
    db2_stop
    ;;

    promote)
    db2_promote
    ;;

    demote)
    db2_demote
    ;;

    notify)
    ocf_log debug "notify-action has been DEPRECATED, and should be removed"
    ;;

    monitor)
    db2_monitor_retry
    ;;

    validate-all)
    exit $validate_rc
    ;;

    *)
    db2_usage
    exit $OCF_ERR_UNIMPLEMENTED
esac

# exit with the status of the action function that ran above
exit $?
