From 30b06b426c8ecc6b559ffe8202aac4cb882a8696 Mon Sep 17 00:00:00 2001 From: Jack Challen Date: Mon, 3 Feb 2025 13:03:46 +0000 Subject: [PATCH] wekachecker architecture doesn't handle separated test like this Wekachecker tries to be the governing over-arching controller, so having a single script that parses and tests multiple conditions separately doesn't fit with that architecture. Therefore make every test self-contained --- .../performance/.01_small_io_writes.test | 13 -- scripts.d/performance/.02_faulty_drive.test | 13 -- scripts.d/performance/.03_slow_drives.test | 13 -- scripts.d/performance/.04_drive_trimming.test | 13 -- .../.05_hard_quota_limit_full.test | 13 -- .../performance/.06_network_congestion.test | 13 -- scripts.d/performance/.07_faulty_nic.test | 13 -- .../performance/001_performance_tests.sh | 113 ---------------- scripts.d/performance/001_small_io_writes.sh | 123 ++++++++++++++++++ scripts.d/performance/002_faulty_drive.sh | 123 ++++++++++++++++++ scripts.d/performance/003_slow_drives.sh | 123 ++++++++++++++++++ scripts.d/performance/004_drive_trimming.sh | 122 +++++++++++++++++ scripts.d/performance/005_hard_quota_limit.sh | 123 ++++++++++++++++++ .../performance/006_network_congestion.sh | 122 +++++++++++++++++ scripts.d/performance/007_faulty_nic.sh | 123 ++++++++++++++++++ 15 files changed, 859 insertions(+), 204 deletions(-) delete mode 100644 scripts.d/performance/.01_small_io_writes.test delete mode 100644 scripts.d/performance/.02_faulty_drive.test delete mode 100644 scripts.d/performance/.03_slow_drives.test delete mode 100644 scripts.d/performance/.04_drive_trimming.test delete mode 100644 scripts.d/performance/.05_hard_quota_limit_full.test delete mode 100644 scripts.d/performance/.06_network_congestion.test delete mode 100644 scripts.d/performance/.07_faulty_nic.test delete mode 100644 scripts.d/performance/001_performance_tests.sh create mode 100644 scripts.d/performance/001_small_io_writes.sh create mode 100644 
scripts.d/performance/002_faulty_drive.sh create mode 100644 scripts.d/performance/003_slow_drives.sh create mode 100644 scripts.d/performance/004_drive_trimming.sh create mode 100644 scripts.d/performance/005_hard_quota_limit.sh create mode 100644 scripts.d/performance/006_network_congestion.sh create mode 100644 scripts.d/performance/007_faulty_nic.sh diff --git a/scripts.d/performance/.01_small_io_writes.test b/scripts.d/performance/.01_small_io_writes.test deleted file mode 100644 index d16f9b5..0000000 --- a/scripts.d/performance/.01_small_io_writes.test +++ /dev/null @@ -1,13 +0,0 @@ -TEST_DESCRIPTION="Small IO writes" -INTERNAL_REFERENCE="WEKAPP-XXXXX" -CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:more_than:94:-2m" -DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:more_than:250:-2m" -CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:more_than:5:-2m" -DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" -CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" -CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" diff --git a/scripts.d/performance/.02_faulty_drive.test b/scripts.d/performance/.02_faulty_drive.test deleted file mode 100644 index 68f1cfc..0000000 --- a/scripts.d/performance/.02_faulty_drive.test +++ /dev/null @@ -1,13 +0,0 @@ -TEST_DESCRIPTION="Faulty drive" -INTERNAL_REFERENCE="WEKAPP-365973" -CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:more_than:1ms:-2m" 
-DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:more_than:94:-2m" -DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m" -CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m" -DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" -CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" -CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" diff --git a/scripts.d/performance/.03_slow_drives.test b/scripts.d/performance/.03_slow_drives.test deleted file mode 100644 index aa1ed7b..0000000 --- a/scripts.d/performance/.03_slow_drives.test +++ /dev/null @@ -1,13 +0,0 @@ -TEST_DESCRIPTION="Slow drives" -INTERNAL_REFERENCE="WEKAPP-XXXXX" -CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:more_than:94:-2m" -DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:more_than:250:-2m" -CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m" -DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" -CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" -CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" diff --git a/scripts.d/performance/.04_drive_trimming.test b/scripts.d/performance/.04_drive_trimming.test deleted file mode 100644 index 4c03af7..0000000 --- a/scripts.d/performance/.04_drive_trimming.test +++ /dev/null @@ -1,13 +0,0 @@ -TEST_DESCRIPTION="Drive undergoing TRIM" -INTERNAL_REFERENCE="WEKAPP-XXXXX" 
-CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m" -DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m" -CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m" -DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:less_than:700:-2m" -CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" -CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" diff --git a/scripts.d/performance/.05_hard_quota_limit_full.test b/scripts.d/performance/.05_hard_quota_limit_full.test deleted file mode 100644 index c74338e..0000000 --- a/scripts.d/performance/.05_hard_quota_limit_full.test +++ /dev/null @@ -1,13 +0,0 @@ -TEST_DESCRIPTION="Hard quota full" -INTERNAL_REFERENCE="WEKAPP-XXXXX" -CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:less_than:10ms:-2m" -CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m" -DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m" -CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:more_than:5:-2m" -DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:less_than:700:-2m" -CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" -CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" diff --git a/scripts.d/performance/.06_network_congestion.test b/scripts.d/performance/.06_network_congestion.test 
deleted file mode 100644 index 44344a5..0000000 --- a/scripts.d/performance/.06_network_congestion.test +++ /dev/null @@ -1,13 +0,0 @@ -TEST_DESCRIPTION="Network Congestion" -INTERNAL_REFERENCE="WEKAPP-XXXXX" -CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m" -DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m" -CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m" -DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" -CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:less_than:90:-2m" -CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" diff --git a/scripts.d/performance/.07_faulty_nic.test b/scripts.d/performance/.07_faulty_nic.test deleted file mode 100644 index da29087..0000000 --- a/scripts.d/performance/.07_faulty_nic.test +++ /dev/null @@ -1,13 +0,0 @@ -TEST_DESCRIPTION="Faulty NIC" -INTERNAL_REFERENCE="WEKAPP-XXXXX" -CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" -CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m" -DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m" -DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m" -CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m" -DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" -CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" 
-CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:more_than:0.0005:-2m" diff --git a/scripts.d/performance/001_performance_tests.sh b/scripts.d/performance/001_performance_tests.sh deleted file mode 100644 index c73d99c..0000000 --- a/scripts.d/performance/001_performance_tests.sh +++ /dev/null @@ -1,113 +0,0 @@ -#!/bin/bash - -declare -A DRIVE_TESTS -declare -A CLUSTER_TESTS - -DESCRIPTION="Examine WEKA stats for known performance impacts" -SCRIPT_TYPE="single" - - -# How this works: -# This shell loops through all .test files. -# It starts out assuming the test matches (i.e. the result is true). -# For every condition that is found in the .test file: if any is found to be false, it then marks the entire test as "not matching" -# In other words: assume it's matching, until it encounters something that says it doesn't apply. -TEST_RESULTS_MATCHED="1" -RESULT=0 - -convert_to_standard_units() { - local VALUE="$1" - # Now we need to convert VALUE to standardised units. We can't rely on the existence of units(1) or $(systemd-analyze timestamp), unfortunately - # This means we're going to convert using sed and awk, for which I apologize - # right now we're only calculating based on units of time, and we only seem to see micro- and milliseconds. - VALUE_CALC=$(echo ${VALUE} | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s! *!!g;s!s$!!g;s!%!!g') - STANDARDIZED_VALUE=$(echo | awk "{print ${VALUE_CALC}"}) - echo ${STANDARDIZED_VALUE} -} - -for TESTFILE in .*.test ; do - TEST_RESULTS_MATCHED="1" - . ${TESTFILE} - echo "Looking for ${TEST_DESCRIPTION}" - - #there's a chance we will need to break this down into very different structures of testing, so - # keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe - # for re-factoring. 
- for DRIVE_TEST in ${!DRIVE_TESTS[@]} ; do - echo Running ${DRIVE_TEST} - TEST_MODE=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $1}') - COMPARISON=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $2}') - TEST_VALUE=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $3}') - TIME_PERIOD=$(echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $4}') - #set default values - TEST_MODE="${TEST_MODE:-individual}" - COMPARISON="${COMPARISON:-more_than}" - TEST_VALUE="${TEST_VALUE:-1}" - TIME_PERIOD="${TIME_PERIOD:-"-1m"}" - TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}" ) - - # is this a per-disk test? - DISK_PARAM="" - if [[ ${TEST_MODE} == "individual" ]] ; then - DISK_PARAM="--param disk:*" - fi - - echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}" - if [[ ${COMPARISON} == "more_than" ]] ; then - HIGHEST_VALUE=$(weka stats --show-internal --stat ${DRIVE_TEST} ${DISK_PARAM} --sort value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) - HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}" ) - # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing - if (( $(echo ${HIGHEST_VALUE} ${TEST_VALUE} | awk '{if ($1 < $2) print 1;}') )); then - TEST_RESULTS_MATCHED="0" - fi - elif [[ ${COMPARISON} == "less_than" ]] ; then - LOWEST_VALUE=$(weka stats --show-internal --stat ${DRIVE_TEST} ${DISK_PARAM} --sort -value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) - LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}" ) - # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. 
Otherwise do nothing - if (( $(echo ${LOWEST_VALUE} ${TEST_VALUE} | awk '{if ($1 > $2) print 1;}') )); then - TEST_RESULTS_MATCHED="0" - fi - fi - done - for CLUSTER_TEST in ${!CLUSTER_TESTS[@]} ; do - echo Running ${CLUSTER_TEST} - TEST_MODE=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $1}') - COMPARISON=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $2}') - TEST_VALUE=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $3}') - TIME_PERIOD=$(echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $4}') - #set default values - TEST_MODE="${TEST_MODE:-individual}" - COMPARISON="${COMPARISON:-more_than}" - TEST_VALUE="${TEST_VALUE:-1}" - TIME_PERIOD="${TIME_PERIOD:-"-1m"}" - TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}" ) - - # is this a per-process test? - PROCESS_PARAM="" - if [[ ${TEST_MODE} == "individual" ]] ; then - PROCESS_PARAM="--per-process" - fi - - echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}" - if [[ ${COMPARISON} == "more_than" ]] ; then - HIGHEST_VALUE=$(weka stats --show-internal --stat ${CLUSTER_TEST} ${PROCESS_PARAM} --sort value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) - HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}" ) - # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. 
Otherwise do nothing - if (( $(echo ${HIGHEST_VALUE} ${TEST_VALUE} | awk '{if ($1 < $2) print 1;}') )); then - TEST_RESULTS_MATCHED="0" - fi - elif [[ ${COMPARISON} == "less_than" ]] ; then - LOWEST_VALUE=$(weka stats --show-internal --stat ${CLUSTER_TEST} ${PROCESS_PARAM} --sort -value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) - LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}" ) - # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing - if (( $(echo ${LOWEST_VALUE} ${TEST_VALUE} | awk '{if ($1 > $2) print 1;}') )); then - TEST_RESULTS_MATCHED="0" - fi - fi - done - - if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then - RESULT=254 - echo "The test ${TESTFILE} - looking for ${TEST_DESCRIPTION} appeared to match - please review ${INTERNAL_REFERENCE} for details" - fi -done diff --git a/scripts.d/performance/001_small_io_writes.sh b/scripts.d/performance/001_small_io_writes.sh new file mode 100644 index 0000000..ca006c4 --- /dev/null +++ b/scripts.d/performance/001_small_io_writes.sh @@ -0,0 +1,123 @@ +#!/bin/bash + +declare -A DRIVE_TESTS +declare -A CLUSTER_TESTS + +DESCRIPTION="Examine WEKA stats for small IO writes" +SCRIPT_TYPE="single" +INTERNAL_REFERENCE="WEKAPP-XXXXX" +CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" +CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" +CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:more_than:1ms:-2m" +DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m" +DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m" +DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:more_than:94:-2m" +DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:more_than:250:-2m" +CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:more_than:5:-2m" +DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" 
+CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" +CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" + +# How this works: +# It starts out assuming the test matches (i.e. the result is true). +# For every condition that is found above: if any is found to be false, it then marks the entire test as "not matching" +# In other words: assume it's matching, until it encounters something that says it doesn't apply. +TEST_RESULTS_MATCHED="1" +RESULT=0 + +convert_to_standard_units() { + local VALUE="$1" + # Now we need to convert VALUE to standardised units. We can't rely on the existence of units(1) or $(systemd-analyze timestamp), unfortunately + # This means we're going to convert using sed and awk, for which I apologize + # right now we're only calculating based on units of time, and we only seem to see micro- and milliseconds. + VALUE_CALC=$(echo ${VALUE} | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s! *!!g;s!s$!!g;s!%!!g') + STANDARDIZED_VALUE=$(echo | awk "{print ${VALUE_CALC}"}) + echo ${STANDARDIZED_VALUE} +} + +#there's a chance we will need to break this down into very different structures of testing, so +# keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe +# for re-factoring. +for DRIVE_TEST in ${!DRIVE_TESTS[@]} ; do + echo Running ${DRIVE_TEST} + TEST_MODE=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $1}') + COMPARISON=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $2}') + TEST_VALUE=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $3}') + TIME_PERIOD=$(echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $4}') + #set default values + TEST_MODE="${TEST_MODE:-individual}" + COMPARISON="${COMPARISON:-more_than}" + TEST_VALUE="${TEST_VALUE:-1}" + TIME_PERIOD="${TIME_PERIOD:-"-1m"}" + TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}" ) + + # is this a per-disk test? 
+ DISK_PARAM="" + if [[ ${TEST_MODE} == "individual" ]] ; then + DISK_PARAM="--param disk:*" + fi + + echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}" + if [[ ${COMPARISON} == "more_than" ]] ; then + HIGHEST_VALUE=$(weka stats --show-internal --stat ${DRIVE_TEST} ${DISK_PARAM} --sort value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing + if (( $(echo ${HIGHEST_VALUE} ${TEST_VALUE} | awk '{if ($1 < $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + elif [[ ${COMPARISON} == "less_than" ]] ; then + LOWEST_VALUE=$(weka stats --show-internal --stat ${DRIVE_TEST} ${DISK_PARAM} --sort -value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing + if (( $(echo ${LOWEST_VALUE} ${TEST_VALUE} | awk '{if ($1 > $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + fi +done +for CLUSTER_TEST in ${!CLUSTER_TESTS[@]} ; do + echo Running ${CLUSTER_TEST} + TEST_MODE=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $1}') + COMPARISON=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $2}') + TEST_VALUE=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $3}') + TIME_PERIOD=$(echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $4}') + #set default values + TEST_MODE="${TEST_MODE:-individual}" + COMPARISON="${COMPARISON:-more_than}" + TEST_VALUE="${TEST_VALUE:-1}" + TIME_PERIOD="${TIME_PERIOD:-"-1m"}" + TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}" ) + + # is this a per-process test? 
+ PROCESS_PARAM="" + if [[ ${TEST_MODE} == "individual" ]] ; then + PROCESS_PARAM="--per-process" + fi + + echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}" + if [[ ${COMPARISON} == "more_than" ]] ; then + HIGHEST_VALUE=$(weka stats --show-internal --stat ${CLUSTER_TEST} ${PROCESS_PARAM} --sort value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing + if (( $(echo ${HIGHEST_VALUE} ${TEST_VALUE} | awk '{if ($1 < $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + elif [[ ${COMPARISON} == "less_than" ]] ; then + LOWEST_VALUE=$(weka stats --show-internal --stat ${CLUSTER_TEST} ${PROCESS_PARAM} --sort -value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. 
Otherwise do nothing + if (( $(echo ${LOWEST_VALUE} ${TEST_VALUE} | awk '{if ($1 > $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + fi +done + +if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then + RESULT=254 + echo "The cluster statistics appeared to match this known performance impact - please review ${INTERNAL_REFERENCE} for details" +else + RESULT=0 + echo "The cluster statistics do not match this known performance impact" +fi + +exit ${RESULT} + diff --git a/scripts.d/performance/002_faulty_drive.sh b/scripts.d/performance/002_faulty_drive.sh new file mode 100644 index 0000000..08d0d24 --- /dev/null +++ b/scripts.d/performance/002_faulty_drive.sh @@ -0,0 +1,123 @@ +#!/bin/bash + +declare -A DRIVE_TESTS +declare -A CLUSTER_TESTS + +DESCRIPTION="Examine WEKA stats for badly-performing drives" +SCRIPT_TYPE="single" +INTERNAL_REFERENCE="WEKAPP-365973" +CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" +CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" +CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:more_than:1ms:-2m" +DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m" +DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:more_than:1ms:-2m" +DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:more_than:94:-2m" +DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m" +CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m" +DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" +CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" +CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" + +# How this works: +# It starts out assuming the test matches (i.e. the result is true). +# For every condition that is found above: if any is found to be false, it then marks the entire test as "not matching" +# In other words: assume it's matching, until it encounters something that says it doesn't apply. 
+TEST_RESULTS_MATCHED="1" +RESULT=0 + +convert_to_standard_units() { + local VALUE="$1" + # Now we need to convert VALUE to standardised units. We can't rely on the existence of units(1) or $(systemd-analyze timestamp), unfortunately + # This means we're going to convert using sed and awk, for which I apologize + # right now we're only calculating based on units of time, and we only seem to see micro- and milliseconds. + VALUE_CALC=$(echo ${VALUE} | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s! *!!g;s!s$!!g;s!%!!g') + STANDARDIZED_VALUE=$(echo | awk "{print ${VALUE_CALC}"}) + echo ${STANDARDIZED_VALUE} +} + +#there's a chance we will need to break this down into very different structures of testing, so +# keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe +# for re-factoring. +for DRIVE_TEST in ${!DRIVE_TESTS[@]} ; do + echo Running ${DRIVE_TEST} + TEST_MODE=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $1}') + COMPARISON=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $2}') + TEST_VALUE=$( echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $3}') + TIME_PERIOD=$(echo ${DRIVE_TESTS[${DRIVE_TEST}]} | awk -F: '{print $4}') + #set default values + TEST_MODE="${TEST_MODE:-individual}" + COMPARISON="${COMPARISON:-more_than}" + TEST_VALUE="${TEST_VALUE:-1}" + TIME_PERIOD="${TIME_PERIOD:-"-1m"}" + TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}" ) + + # is this a per-disk test? 
+ DISK_PARAM="" + if [[ ${TEST_MODE} == "individual" ]] ; then + DISK_PARAM="--param disk:*" + fi + + echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}" + if [[ ${COMPARISON} == "more_than" ]] ; then + HIGHEST_VALUE=$(weka stats --show-internal --stat ${DRIVE_TEST} ${DISK_PARAM} --sort value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing + if (( $(echo ${HIGHEST_VALUE} ${TEST_VALUE} | awk '{if ($1 < $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + elif [[ ${COMPARISON} == "less_than" ]] ; then + LOWEST_VALUE=$(weka stats --show-internal --stat ${DRIVE_TEST} ${DISK_PARAM} --sort -value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing + if (( $(echo ${LOWEST_VALUE} ${TEST_VALUE} | awk '{if ($1 > $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + fi +done +for CLUSTER_TEST in ${!CLUSTER_TESTS[@]} ; do + echo Running ${CLUSTER_TEST} + TEST_MODE=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $1}') + COMPARISON=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $2}') + TEST_VALUE=$( echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $3}') + TIME_PERIOD=$(echo ${CLUSTER_TESTS[${CLUSTER_TEST}]} | awk -F: '{print $4}') + #set default values + TEST_MODE="${TEST_MODE:-individual}" + COMPARISON="${COMPARISON:-more_than}" + TEST_VALUE="${TEST_VALUE:-1}" + TIME_PERIOD="${TIME_PERIOD:-"-1m"}" + TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}" ) + + # is this a per-process test? 
+ PROCESS_PARAM="" + if [[ ${TEST_MODE} == "individual" ]] ; then + PROCESS_PARAM="--per-process" + fi + + echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}" + if [[ ${COMPARISON} == "more_than" ]] ; then + HIGHEST_VALUE=$(weka stats --show-internal --stat ${CLUSTER_TEST} ${PROCESS_PARAM} --sort value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing + if (( $(echo ${HIGHEST_VALUE} ${TEST_VALUE} | awk '{if ($1 < $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + elif [[ ${COMPARISON} == "less_than" ]] ; then + LOWEST_VALUE=$(weka stats --show-internal --stat ${CLUSTER_TEST} ${PROCESS_PARAM} --sort -value --start-time ${TIME_PERIOD} --output value --raw-units | tail -n 1) + LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}" ) + # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. 
Otherwise do nothing + if (( $(echo ${LOWEST_VALUE} ${TEST_VALUE} | awk '{if ($1 > $2) print 1;}') )); then + TEST_RESULTS_MATCHED="0" + fi + fi +done + +if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then + RESULT=254 + echo "The cluster statistics appeared to match this known performance impact - please review ${INTERNAL_REFERENCE} for details" +else + RESULT=0 + echo "The cluster statistics do not match this known performance impact" +fi + +exit ${RESULT} + diff --git a/scripts.d/performance/003_slow_drives.sh b/scripts.d/performance/003_slow_drives.sh new file mode 100644 index 0000000..6cae1ca --- /dev/null +++ b/scripts.d/performance/003_slow_drives.sh @@ -0,0 +1,123 @@ +#!/bin/bash + +declare -A DRIVE_TESTS +declare -A CLUSTER_TESTS + +DESCRIPTION="Examine WEKA stats for poorly-performing drives" +SCRIPT_TYPE="single" +INTERNAL_REFERENCE="WEKAPP-XXXXX" +CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m" +CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m" +CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m" +DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m" +DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:more_than:1ms:-2m" +DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:more_than:94:-2m" +DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:more_than:250:-2m" +CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m" +DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m" +CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m" +CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m" + +# How this works: +# It starts out assuming the test matches (i.e. the result is true). +# For every condition that is found above: if any is found to be false, it then marks the entire test as "not matching" +# In other words: assume it's matching, until it encounters something that says it doesn't apply. 
+TEST_RESULTS_MATCHED="1"
+RESULT=0
+
+convert_to_standard_units() {
+    local VALUE="$1" VALUE_CALC STANDARDIZED_VALUE
+    # Normalise a raw stat or threshold string to a bare number and print it: "N µs" becomes N/1000000,
+    # "N ms" becomes N/1000, whitespace, a trailing "s" and any "%" are dropped, then awk evaluates the result.
+    # units(1) and systemd-analyze cannot be relied upon, hence sed + awk. We only seem to see micro- and milliseconds.
+    VALUE_CALC=$(printf '%s\n' "${VALUE}" | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s![[:space:]]!!g;s!s$!!g;s!%!!g')
+    STANDARDIZED_VALUE=$(awk "BEGIN {print ${VALUE_CALC}}")
+    printf '%s\n' "${STANDARDIZED_VALUE}"
+}
+
+# There's a chance we will need to break this down into very different structures of testing, so
+# keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe
+# for re-factoring.
+for DRIVE_TEST in "${!DRIVE_TESTS[@]}" ; do
+    echo "Running ${DRIVE_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "individual:more_than:1ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${DRIVE_TESTS[${DRIVE_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-disk test? Held as an array so that 'disk:*' reaches weka literally instead of being globbed.
+    DISK_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        DISK_PARAM=(--param 'disk:*')
+    fi
+
+    echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+for CLUSTER_TEST in "${!CLUSTER_TESTS[@]}" ; do
+    echo "Running ${CLUSTER_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "average:more_than:10ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${CLUSTER_TESTS[${CLUSTER_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-process test? Held as an array so the quoted expansion adds no argument at all when empty.
+    PROCESS_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        PROCESS_PARAM=(--per-process)
+    fi
+
+    echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+
+if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then
+    RESULT=254
+    echo "The cluster statistics appeared to match this known performance impact - please review ${INTERNAL_REFERENCE} for details"
+else
+    RESULT=0
+    echo "The cluster statistics do not match this known performance impact"
+fi
+
+exit ${RESULT}
+
diff --git a/scripts.d/performance/004_drive_trimming.sh b/scripts.d/performance/004_drive_trimming.sh
new file mode 100644
index 0000000..fd15e9d
--- /dev/null
+++ b/scripts.d/performance/004_drive_trimming.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+declare -A DRIVE_TESTS
+declare -A CLUSTER_TESTS
+
+DESCRIPTION="Examine WEKA stats for a drive undergoing TRIM"
+SCRIPT_TYPE="single"
+INTERNAL_REFERENCE="WEKAPP-XXXXX"
+CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m"
+CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m"
+CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:more_than:1ms:-2m"
+DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:more_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m"
+DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m"
+CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m"
+DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:less_than:700:-2m"
+CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m"
+CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m"
+# How this works:
+# It starts out assuming the test matches (i.e. the result is true).
+# For every condition that is found above: if any is found to be false, it then marks the entire test as "not matching"
+# In other words: assume it's matching, until it encounters something that says it doesn't apply.
+TEST_RESULTS_MATCHED="1"
+RESULT=0
+
+convert_to_standard_units() {
+    local VALUE="$1" VALUE_CALC STANDARDIZED_VALUE
+    # Normalise a raw stat or threshold string to a bare number and print it: "N µs" becomes N/1000000,
+    # "N ms" becomes N/1000, whitespace, a trailing "s" and any "%" are dropped, then awk evaluates the result.
+    # units(1) and systemd-analyze cannot be relied upon, hence sed + awk. We only seem to see micro- and milliseconds.
+    VALUE_CALC=$(printf '%s\n' "${VALUE}" | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s![[:space:]]!!g;s!s$!!g;s!%!!g')
+    STANDARDIZED_VALUE=$(awk "BEGIN {print ${VALUE_CALC}}")
+    printf '%s\n' "${STANDARDIZED_VALUE}"
+}
+
+# There's a chance we will need to break this down into very different structures of testing, so
+# keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe
+# for re-factoring.
+for DRIVE_TEST in "${!DRIVE_TESTS[@]}" ; do
+    echo "Running ${DRIVE_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "individual:more_than:1ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${DRIVE_TESTS[${DRIVE_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-disk test? Held as an array so that 'disk:*' reaches weka literally instead of being globbed.
+    DISK_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        DISK_PARAM=(--param 'disk:*')
+    fi
+
+    echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+for CLUSTER_TEST in "${!CLUSTER_TESTS[@]}" ; do
+    echo "Running ${CLUSTER_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "average:more_than:10ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${CLUSTER_TESTS[${CLUSTER_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-process test? Held as an array so the quoted expansion adds no argument at all when empty.
+    PROCESS_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        PROCESS_PARAM=(--per-process)
+    fi
+
+    echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+
+if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then
+    RESULT=254
+    echo "The cluster statistics appeared to match this known performance impact - please review ${INTERNAL_REFERENCE} for details"
+else
+    RESULT=0
+    echo "The cluster statistics do not match this known performance impact"
+fi
+
+exit ${RESULT}
+
diff --git a/scripts.d/performance/005_hard_quota_limit.sh b/scripts.d/performance/005_hard_quota_limit.sh
new file mode 100644
index 0000000..df191b7
--- /dev/null
+++ b/scripts.d/performance/005_hard_quota_limit.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+
+declare -A DRIVE_TESTS
+declare -A CLUSTER_TESTS
+
+DESCRIPTION="Examine WEKA stats for quota impact"
+SCRIPT_TYPE="single"
+INTERNAL_REFERENCE="WEKAPP-XXXXX"
+CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m"
+CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:less_than:10ms:-2m"
+CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m"
+DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m"
+CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:more_than:5:-2m"
+DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:less_than:700:-2m"
+CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m"
+CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m"
+
+# How this works:
+# It starts out assuming the test matches (i.e. the result is true).
+# For every condition that is found above: if any is found to be false, it then marks the entire test as "not matching"
+# In other words: assume it's matching, until it encounters something that says it doesn't apply.
+TEST_RESULTS_MATCHED="1"
+RESULT=0
+
+convert_to_standard_units() {
+    local VALUE="$1" VALUE_CALC STANDARDIZED_VALUE
+    # Normalise a raw stat or threshold string to a bare number and print it: "N µs" becomes N/1000000,
+    # "N ms" becomes N/1000, whitespace, a trailing "s" and any "%" are dropped, then awk evaluates the result.
+    # units(1) and systemd-analyze cannot be relied upon, hence sed + awk. We only seem to see micro- and milliseconds.
+    VALUE_CALC=$(printf '%s\n' "${VALUE}" | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s![[:space:]]!!g;s!s$!!g;s!%!!g')
+    STANDARDIZED_VALUE=$(awk "BEGIN {print ${VALUE_CALC}}")
+    printf '%s\n' "${STANDARDIZED_VALUE}"
+}
+
+# There's a chance we will need to break this down into very different structures of testing, so
+# keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe
+# for re-factoring.
+for DRIVE_TEST in "${!DRIVE_TESTS[@]}" ; do
+    echo "Running ${DRIVE_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "individual:more_than:1ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${DRIVE_TESTS[${DRIVE_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-disk test? Held as an array so that 'disk:*' reaches weka literally instead of being globbed.
+    DISK_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        DISK_PARAM=(--param 'disk:*')
+    fi
+
+    echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+for CLUSTER_TEST in "${!CLUSTER_TESTS[@]}" ; do
+    echo "Running ${CLUSTER_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "average:more_than:10ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${CLUSTER_TESTS[${CLUSTER_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-process test? Held as an array so the quoted expansion adds no argument at all when empty.
+    PROCESS_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        PROCESS_PARAM=(--per-process)
+    fi
+
+    echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+
+if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then
+    RESULT=254
+    echo "The cluster statistics appeared to match this known performance impact - please review ${INTERNAL_REFERENCE} for details"
+else
+    RESULT=0
+    echo "The cluster statistics do not match this known performance impact"
+fi
+
+exit ${RESULT}
+
diff --git a/scripts.d/performance/006_network_congestion.sh b/scripts.d/performance/006_network_congestion.sh
new file mode 100644
index 0000000..275c999
--- /dev/null
+++ b/scripts.d/performance/006_network_congestion.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+declare -A DRIVE_TESTS
+declare -A CLUSTER_TESTS
+
+DESCRIPTION="Examine WEKA stats for network congestion"
+SCRIPT_TYPE="single"
+INTERNAL_REFERENCE="WEKAPP-XXXXX"
+CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m"
+CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m"
+CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m"
+DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m"
+CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m"
+DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m"
+CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:less_than:90:-2m"
+CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:less_than:0.0005:-2m"
+# How this works:
+# It starts out assuming the test matches (i.e. the result is true).
+# For every condition that is found above: if any is found to be false, it then marks the entire test as "not matching"
+# In other words: assume it's matching, until it encounters something that says it doesn't apply.
+TEST_RESULTS_MATCHED="1"
+RESULT=0
+
+convert_to_standard_units() {
+    local VALUE="$1" VALUE_CALC STANDARDIZED_VALUE
+    # Normalise a raw stat or threshold string to a bare number and print it: "N µs" becomes N/1000000,
+    # "N ms" becomes N/1000, whitespace, a trailing "s" and any "%" are dropped, then awk evaluates the result.
+    # units(1) and systemd-analyze cannot be relied upon, hence sed + awk. We only seem to see micro- and milliseconds.
+    VALUE_CALC=$(printf '%s\n' "${VALUE}" | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s![[:space:]]!!g;s!s$!!g;s!%!!g')
+    STANDARDIZED_VALUE=$(awk "BEGIN {print ${VALUE_CALC}}")
+    printf '%s\n' "${STANDARDIZED_VALUE}"
+}
+
+# There's a chance we will need to break this down into very different structures of testing, so
+# keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe
+# for re-factoring.
+for DRIVE_TEST in "${!DRIVE_TESTS[@]}" ; do
+    echo "Running ${DRIVE_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "individual:more_than:1ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${DRIVE_TESTS[${DRIVE_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-disk test? Held as an array so that 'disk:*' reaches weka literally instead of being globbed.
+    DISK_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        DISK_PARAM=(--param 'disk:*')
+    fi
+
+    echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+for CLUSTER_TEST in "${!CLUSTER_TESTS[@]}" ; do
+    echo "Running ${CLUSTER_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "average:more_than:10ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${CLUSTER_TESTS[${CLUSTER_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-process test? Held as an array so the quoted expansion adds no argument at all when empty.
+    PROCESS_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        PROCESS_PARAM=(--per-process)
+    fi
+
+    echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+
+if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then
+    RESULT=254
+    echo "The cluster statistics appeared to match this known performance impact - please review ${INTERNAL_REFERENCE} for details"
+else
+    RESULT=0
+    echo "The cluster statistics do not match this known performance impact"
+fi
+
+exit ${RESULT}
+
diff --git a/scripts.d/performance/007_faulty_nic.sh b/scripts.d/performance/007_faulty_nic.sh
new file mode 100644
index 0000000..f58b770
--- /dev/null
+++ b/scripts.d/performance/007_faulty_nic.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+
+declare -A DRIVE_TESTS
+declare -A CLUSTER_TESTS
+
+DESCRIPTION="Examine WEKA stats for underperforming NICs"
+SCRIPT_TYPE="single"
+INTERNAL_REFERENCE="WEKAPP-XXXXX"
+CLUSTER_TESTS["ops_driver/READ_LATENCY"]="average:more_than:10ms:-2m"
+CLUSTER_TESTS["rpc/CLIENT_ROUNDTRIP_AVG"]="individual:more_than:10ms:-2m"
+CLUSTER_TESTS["rpc/SERVER_PROCESSING_AVG"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/SSD_READ_LATENCY"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_READ_LATENCY"]="individual:less_than:1ms:-2m"
+DRIVE_TESTS["ssd/DRIVE_UTILIZATION"]="individual:less_than:90:-2m"
+DRIVE_TESTS["ssd/DRIVE_LOAD"]="individual:less_than:250:-2m"
+CLUSTER_TESTS["DRIVE_READ_RATIO_PER_SSD_READ"]="average:less_than:5:-2m"
+DRIVE_TESTS["ssd/DRIVE_IO_TOO_LONG"]="individual:more_than:700:-2m"
+CLUSTER_TESTS["network/GOODPUT_TX_RATIO"]="average:more_than:90:-2m"
+CLUSTER_TESTS["network/PUMPS_TXQ_FULL"]="average:more_than:0.0005:-2m"
+
+# How this works:
+# It starts out assuming the test matches (i.e. the result is true).
+# For every condition that is found above: if any is found to be false, it then marks the entire test as "not matching"
+# In other words: assume it's matching, until it encounters something that says it doesn't apply.
+TEST_RESULTS_MATCHED="1"
+RESULT=0
+
+convert_to_standard_units() {
+    local VALUE="$1" VALUE_CALC STANDARDIZED_VALUE
+    # Normalise a raw stat or threshold string to a bare number and print it: "N µs" becomes N/1000000,
+    # "N ms" becomes N/1000, whitespace, a trailing "s" and any "%" are dropped, then awk evaluates the result.
+    # units(1) and systemd-analyze cannot be relied upon, hence sed + awk. We only seem to see micro- and milliseconds.
+    VALUE_CALC=$(printf '%s\n' "${VALUE}" | sed 's! *µs!/1000000!g;s! *ms!/1000!g;s![[:space:]]!!g;s!s$!!g;s!%!!g')
+    STANDARDIZED_VALUE=$(awk "BEGIN {print ${VALUE_CALC}}")
+    printf '%s\n' "${STANDARDIZED_VALUE}"
+}
+
+# There's a chance we will need to break this down into very different structures of testing, so
+# keeping DRIVE vs CLUSTER tests very separate at the moment, despite the fact it looks ripe
+# for re-factoring.
+for DRIVE_TEST in "${!DRIVE_TESTS[@]}" ; do
+    echo "Running ${DRIVE_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "individual:more_than:1ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${DRIVE_TESTS[${DRIVE_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-disk test? Held as an array so that 'disk:*' reaches weka literally instead of being globbed.
+    DISK_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        DISK_PARAM=(--param 'disk:*')
+    fi
+
+    echo "Now checking to see if an individual value for ${DRIVE_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${DRIVE_TEST}" "${DISK_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+for CLUSTER_TEST in "${!CLUSTER_TESTS[@]}" ; do
+    echo "Running ${CLUSTER_TEST}"
+    # Each spec has the form "mode:comparison:threshold:period", e.g. "average:more_than:10ms:-2m".
+    # A single read splits it without forking awk four times; a field that is absent is left empty
+    # and picks up its default below.
+    IFS=: read -r TEST_MODE COMPARISON TEST_VALUE TIME_PERIOD <<< "${CLUSTER_TESTS[${CLUSTER_TEST}]}"
+    # set default values
+    TEST_MODE="${TEST_MODE:-individual}"
+    COMPARISON="${COMPARISON:-more_than}"
+    TEST_VALUE="${TEST_VALUE:-1}"
+    TIME_PERIOD="${TIME_PERIOD:-"-1m"}"
+    TEST_VALUE=$(convert_to_standard_units "${TEST_VALUE}")
+
+    # Is this a per-process test? Held as an array so the quoted expansion adds no argument at all when empty.
+    PROCESS_PARAM=()
+    if [[ ${TEST_MODE} == "individual" ]] ; then
+        PROCESS_PARAM=(--per-process)
+    fi
+
+    echo "Now checking to see if an individual value for ${CLUSTER_TEST} is ${COMPARISON} than ${TEST_VALUE} over the last ${TIME_PERIOD}"
+    if [[ ${COMPARISON} == "more_than" ]] ; then
+        HIGHEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        HIGHEST_VALUE=$(convert_to_standard_units "${HIGHEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${HIGHEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 < $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    elif [[ ${COMPARISON} == "less_than" ]] ; then
+        LOWEST_VALUE=$(weka stats --show-internal --stat "${CLUSTER_TEST}" "${PROCESS_PARAM[@]}" --sort -value --start-time "${TIME_PERIOD}" --output value --raw-units | tail -n 1)
+        LOWEST_VALUE=$(convert_to_standard_units "${LOWEST_VALUE}")
+        # Because of the "assume the test matches" logic, we only need to mark the test as not matching if the current comparison fails. Otherwise do nothing
+        if (( $(echo "${LOWEST_VALUE}" "${TEST_VALUE}" | awk '{if ($1 > $2) print 1;}') )); then
+            TEST_RESULTS_MATCHED="0"
+        fi
+    fi
+done
+
+if [[ ${TEST_RESULTS_MATCHED} == "1" ]] ; then
+    RESULT=254
+    echo "The cluster statistics appeared to match this known performance impact - please review ${INTERNAL_REFERENCE} for details"
+else
+    RESULT=0
+    echo "The cluster statistics do not match this known performance impact"
+fi
+
+exit ${RESULT}
+