diff options
author | Thang Q. Nguyen <thang@os.amperecomputing.com> | 2023-05-05 11:33:27 +0300 |
---|---|---|
committer | Thang Q. Nguyen <thang@os.amperecomputing.com> | 2023-05-08 05:39:39 +0300 |
commit | 98de8b95f8076ccaa4e3613ead581d130bc76d55 (patch) | |
tree | 9f92795c621cfc0b99e4c97724b607550c71faf0 /meta-ampere/meta-mitchell/recipes-ampere | |
parent | ed7346e3a02e40eeb6357d466513d537897e592a (diff) | |
download | openbmc-98de8b95f8076ccaa4e3613ead581d130bc76d55.tar.xz |
meta-ampere: add fault monitor support
Add support for detecting GPIO, PSU, fan, and other faults and for turning the fault LED on or off accordingly.
Tested:
1. Unplug a PSU and check if Fault LED is turned ON.
2. Unplug a FAN and check if Fault LED is turned ON.
3. Simulate a GPIO fault pattern and check that the BMC detects it.
Signed-off-by: Thang Q. Nguyen <thang@os.amperecomputing.com>
Signed-off-by: Hieu Huynh <hieuh@os.amperecomputing.com>
Signed-off-by: Quang Nguyen <quangn@amperecomputing.com>
Change-Id: Idfcd32953cf811fbe9299a162f604cb8fd028962
Diffstat (limited to 'meta-ampere/meta-mitchell/recipes-ampere')
3 files changed, 583 insertions, 0 deletions
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend
new file mode 100644
index 0000000000..2f3e457e38
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend
@@ -0,0 +1,14 @@
+FILESEXTRAPATHS:append := "${THISDIR}/${PN}:"
+
+RDEPENDS:${PN} = "bash"
+
+SRC_URI += " \
+           file://ampere_fault_monitor.sh \
+           file://ampere_check_gpio_fault.sh \
+          "
+
+do_install() {
+    install -d ${D}/${sbindir}
+    install -m 755 ${WORKDIR}/ampere_fault_monitor.sh ${D}/${sbindir}/
+    install -m 755 ${WORKDIR}/ampere_check_gpio_fault.sh ${D}/${sbindir}/
+}
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
new file mode 100755
index 0000000000..9922420b81
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
@@ -0,0 +1,289 @@
+#!/bin/bash
+
+# This script monitors the S0/S1 fault GPIO and decodes the error patterns
+# signalled by the CPUs.
+#
+# So far, no specification describes the LED behavior when an error (a pattern)
+# is detected. So when a pattern is detected, we simply create the GPIO fault
+# flag file; ampere_fault_monitor.sh polls that file and turns on the fault LED.
+#
+# The pattern has the format:
+#   <minor_byte> <quiet_gap_1second> <major_byte> <stop_condition_low_for_3seconds>
+#
+# Ex: pattern minor_byte=0x03, major_byte=0x02, you will see the waveform like
+# _1010100...(quiet gap, low for 1 second)..0111111111000000000111111111110000000000...(stop condition, low for 3 seconds)..
+#
+# Usage: <app_name> <socket 0/1>
+#
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# global variables
+error_flag='/tmp/gpio_fault'
+
+# Reading the GPIO value with "cat /sys/class/gpio/gpio<N>/value" itself takes
+# 10ms~35ms to complete, depending on CPU load, so the minor byte is polled
+# with no extra delay.
+polling_minor_byte_rate=0
+polling_major_byte_rate=200000
+polling_rate=$polling_minor_byte_rate
+
+# the number of low samples that proves we already left minor_byte and are in
+# the quiet gap; this value depends on polling_minor_byte_rate
+max_low_in_minor_byte=9
+
+# the number of low samples that proves we already left major_byte and are in
+# the stop condition; this value depends on polling_major_byte_rate
+max_low_in_major_byte=9
+
+max_low=$max_low_in_minor_byte
+
+# state machine:
+#   detecting_minor_byte=0
+#   detecting_major_byte=1
+curr_state=0
+
+minor_byte=0
+major_byte=0
+
+gpio_status=0
+
+socket=$1
+
+socket1_present=151
+socket1_status=1
+
+S0_fault_gpio='s0-fault-alert'
+S1_fault_gpio='s1-fault-alert'
+
+# Translate major_byte/minor_byte into the event_major/event_minor names.
+# Any code without a known name is reported as "NOT_SUPPORT".
+map_event_name() {
+    # Reset both names first so an unknown major code can never leave a stale
+    # minor name from a previously decoded pattern.
+    event_major="NOT_SUPPORT"
+    event_minor="NOT_SUPPORT"
+
+    case "$major_byte" in
+        2)
+            event_major="FAULT_LED_BOOT_ERROR"
+            case "$minor_byte" in
+                1) event_minor="SOC_BOOTDEV_INIT_SEC_ERROR" ;;
+                2) event_minor="SECJMP_FAIL_ERROR" ;;
+                3) event_minor="UART_INIT_WARN" ;;
+                4) event_minor="UART_TX_WARN" ;;
+                5) event_minor="SOC_ROMPATCH_BAD_ERROR" ;;
+                6) event_minor="SOC_ROMPATCH_RANGE_ERROR" ;;
+                7) event_minor="SPI_INIT_ERROR" ;;
+                8) event_minor="SPI_TX_ERROR" ;;
+                9) event_minor="SPINOR_UNKNOW_DEVICE_WARN" ;;
+                10) event_minor="EEPROM_BAD_NVP_HEADER_WARN" ;;
+                11) event_minor="EEPROM_BAD_NVP_FIELD_WARN" ;;
+                12) event_minor="EEPROM_BAD_CHECKSUM_ERROR_WARN" ;;
+                13) event_minor="I2C_DMA_ERROR" ;;
+                14) event_minor="I2C_TIMEOUT_ERROR" ;;
+                15) event_minor="SOC_BOOTDEV_SPI_LOAD_ERROR" ;;
+                16) event_minor="SOC_BOOTDEV_AUTHENTICATION_ERROR" ;;
+                17) event_minor="PCP_POWERUP_FAILED" ;;
+                18) event_minor="PCP_POWERDOWN_FAILED" ;;
+                19) event_minor="CPUPLL_INIT_FAILED" ;;
+                20) event_minor="MESHPLL_INIT_FAILED" ;;
+            esac
+            ;;
+        3)
+            event_major="FAULT_LED_FW_LOAD_ERROR"
+            case "$minor_byte" in
+                9) event_minor="LFS_ERROR" ;;
+            esac
+            ;;
+        4)
+            event_major="FAULT_LED_SECURITY_ERROR"
+            case "$minor_byte" in
+                1) event_minor="SEC_INVALID_KEY_CERT" ;;
+                2) event_minor="SEC_INVALID_CONT_CERT" ;;
+                3) event_minor="SEC_INVALID_ROOT_KEY" ;;
+                4) event_minor="SEC_INVALID_SECPRO_KEY" ;;
+                5) event_minor="SEC_INVALID_KEY_CERT_SIG" ;;
+                6) event_minor="SEC_INVALID_CONT_CERT_SIG" ;;
+                7) event_minor="SEC_INVALID_IMAGE_HASH" ;;
+                8) event_minor="SEC_INVALID_PRI_VERSION" ;;
+                9) event_minor="SEC_HUK_MISMATCH" ;;
+                10) event_minor="SEC_FUSE_BLOW_CERT_WITHOUT_SPECIAL_BOOT_PIN" ;;
+                11) event_minor="SEC_INVALID_CERT_SUBTYPE_STRUCT" ;;
+                12) event_minor="SEC_TMMCFG_FAIL" ;;
+                13) event_minor="SEC_INVALID_LCS_FROM_EFUSE" ;;
+                14) event_minor="SEC_EFUSE_WRITE_FAILED" ;;
+                15) event_minor="SEC_INVALID_CERT_VALUE" ;;
+                16) event_minor="SEC_INVALID_CERT_VERSION" ;;
+            esac
+            ;;
+        5)
+            event_major="FAULT_LED_EXCEPTION_ERROR"
+            case "$minor_byte" in
+                1) event_minor="KERNEL_EXCEPTION_UNKNOWN_REASON_ERROR" ;;
+                2) event_minor="KERNEL_EXCEPTION_HARD_FAULT_ERROR" ;;
+                3) event_minor="KERNEL_EXCEPTION_BUS_FAULT_ERROR" ;;
+                4) event_minor="KERNEL_EXCEPTION_MEMMANAGE_FAULT_ERROR" ;;
+                5) event_minor="KERNEL_EXCEPTION_USAGE_FAULT_ERROR" ;;
+            esac
+            ;;
+    esac
+}
+
+# Create ($1 == 1) or remove ($1 == 0) the GPIO fault flag file.
+set_unset_gpio_fault_flag() {
+    if [ ! -f "$error_flag" ] && [ "$1" == 1 ]; then
+        touch "$error_flag"
+    elif [ -f "$error_flag" ] && [ "$1" == 0 ]; then
+        rm -f "$error_flag"
+    fi
+}
+
+# Advance the state machine after one byte has been captured: after the minor
+# byte, start looking for the major byte; after the major byte, report the
+# decoded event, raise the fault flag and go back to minor byte detection.
+toggle_state() {
+    if [ "$curr_state" == 0 ]; then
+        curr_state=1
+        polling_rate=$polling_major_byte_rate
+    else
+        curr_state=0
+        polling_rate=$polling_minor_byte_rate
+        map_event_name
+        echo "detected major_byte=$event_major, minor_byte=$event_minor"
+        set_unset_gpio_fault_flag 1
+    fi
+}
+
+# Store the pulse count $1 as the byte the state machine is currently detecting.
+save_pulse_of_byte() {
+    if [ "$curr_state" == 0 ]; then
+        minor_byte=$1
+    else
+        major_byte=$1
+    fi
+}
+
+# Count the falling edges (i.e. the high pulses) of one byte. We do not care
+# whether a pulse lasts 50ms or 500ms, only how many pulses there are.
+# The byte is finished once cnt_low reaches max_low; its pulse count is then
+# saved and the state machine is advanced.
+cnt_falling_edge_in_byte() {
+    local cnt_falling_edge=0
+    local cnt_low=0
+    local prev=0
+    local curr=0
+
+    while true; do
+        prev=$curr
+        curr=$gpio_status
+        if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then
+            # falling edge: count it and restart the low-sample counter
+            cnt_falling_edge=$((cnt_falling_edge + 1))
+            cnt_low=0
+            continue
+        elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then
+            # still low: check if we reached the quiet gap or stop condition
+            cnt_low=$((cnt_low + 1))
+            if [ "$cnt_low" == "$max_low" ]; then
+                save_pulse_of_byte "$cnt_falling_edge"
+                toggle_state
+                break
+            fi
+        fi
+        usleep "$polling_rate"
+        # NOTE(review): keep using cat here; the polling rates above look
+        # tuned around the time this command takes - confirm before changing.
+        gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+    done
+}
+
+# Export GPIO $1 (default: $gpio_Id) through sysfs and configure it as input.
+gpio_config_input() {
+    local id=${1:-$gpio_Id}
+
+    echo "$id" > /sys/class/gpio/export
+    echo "in" > /sys/class/gpio/gpio"${id}"/direction
+}
+
+# Print the global GPIO number of the line named $1, or -1 when it cannot be
+# resolved.
+gpio_number() {
+    local str
+    local gpioid
+    local offset
+    local base
+
+    # Any gpiofind failure means the line is unusable, not just exit code 1.
+    if ! str=$(gpiofind "$1"); then
+        echo -1
+        return
+    fi
+
+    # str is parsed as "gpiochip<N> <offset>": character 9 is the chip index
+    gpioid=$(echo "$str" | cut -c 9)
+    offset=$(echo "$str" | cut -d " " -f 2)
+    if [ -z "$gpioid" ] || [ -z "$offset" ]; then
+        echo -1
+        return
+    fi
+
+    # AST2600_GPIO_BASE is presumably provided by the sourced gpio-lib.sh
+    base=${AST2600_GPIO_BASE[$gpioid]}
+    if [ -z "$base" ]; then
+        echo -1
+        return
+    fi
+    echo $((offset + base))
+}
+
+# Resolve the fault GPIO into gpio_Id and make sure it is exported as an input.
+init_sysfs_fault_gpio() {
+    gpio_Id=$(gpio_number "$fault_gpio")
+    if [ -z "$gpio_Id" ] || [ "$gpio_Id" == "-1" ]; then
+        echo "Invalid GPIO number"
+        exit 1
+    fi
+
+    if [ -d /sys/class/gpio/gpio"$gpio_Id" ]; then
+        return
+    fi
+    gpio_config_input "$gpio_Id"
+}
+
+# init
+if [ "$socket" == "0" ]; then
+    fault_gpio="$S0_fault_gpio"
+else
+    socket1_status=$(gpioget 0 "$socket1_present")
+    if [ "$socket1_status" == 1 ]; then
+        echo "socket 1 not present"
+        exit 0
+    fi
+    fault_gpio=$S1_fault_gpio
+fi
+
+init_sysfs_fault_gpio
+
+# daemon start
+while true; do
+    # detect when pattern starts
+    if [ "$gpio_status" == 1 ]; then
+        if [ "$curr_state" == 0 ]; then
+            # detecting minor byte, set up minor byte variables
+            max_low=$max_low_in_minor_byte
+            polling_rate=$polling_minor_byte_rate
+        else
+            # detecting major byte, set up major byte variables
+            max_low=$max_low_in_major_byte
+            polling_rate=$polling_major_byte_rate
+        fi
+        # now, there is something on gpio, check if that is a byte pattern
+        cnt_falling_edge_in_byte
+    fi
+
+    usleep "$polling_rate"
+    gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+done
+
+exit 1
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
new file mode 100644
index 0000000000..e176629dfd
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# This script monitors fan, over-temperature, PSU and CPU/SCP (GPIO) failures
+# and updates the fault LED status accordingly.
+
+# shellcheck disable=SC2004
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# common variables
+on=1
+off=0
+fault="false"
+
+# over-temperature variables
+overtemp_occured="false"
+overtemp_fault_flag='/tmp/fault_overtemp'
+
+# gpio fault
+gpio_fault="false"
+gpio_fault_flag="/tmp/gpio_fault"
+
+# fan variables
+fan_failed="false"
+fan_failed_flag='/tmp/fan_failed'
+
+# PSU variables
+psu_failed="false"
+psu_bus=2
+psu0_addr=0x58
+psu1_addr=0x59
+status_word_cmd=0x79
+# Following the PMBus Specification
+# Bit[1]: CML faults
+# Bit[2]: Over temperature faults
+# Bit[3]: Under voltage faults
+# Bit[4]: Over current faults
+# Bit[5]: Over voltage fault
+# Bit[10]: Fan faults
+psu_fault_bitmask=0x43e
+
+# led variables
+fan_fault_led_status=$off
+psu_fault_led_status=$off
+led_bus=15
+led_addr=0x22
+led_port0_config=0x06
+led_port0_output=0x02
+# bit masks of the fan/PSU fault LEDs in Port0 of the CPLD I/O expander
+fan_fault_led_mask=1
+psu_fault_led_mask=2
+
+# functions declaration
+
+# Succeed when $1 is a decimal or 0x-prefixed hexadecimal number, i.e. a value
+# that is safe to use in a shell arithmetic expression.
+is_integer() {
+    [[ "$1" =~ ^(0[xX][0-9a-fA-F]+|0|[1-9][0-9]*)$ ]]
+}
+
+# A fan failure is reported while the fan_failed flag file exists.
+check_fan_failed() {
+    if [[ -f $fan_failed_flag ]]; then
+        fan_failed="true"
+    else
+        fan_failed="false"
+    fi
+}
+
+# Drive one fault LED through the CPLD I/O expander (address $led_addr on I2C
+# bus $led_bus).
+#   $1 - bit mask of the LED in Port0
+#   $2 - $on or $off
+# Returns non-zero without changing the LED when a register cannot be read:
+# an empty value in the arithmetic below is an expansion error, and bash can
+# abandon the daemon's main loop on it.
+set_cpld_fault_led() {
+    local mask=$1
+    local cfg
+    local led_st
+
+    # Configure the Port0 pin as output by clearing its config bit
+    cfg=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_config")
+    if ! is_integer "$cfg"; then
+        echo "Failed to read fault LED config register" >&2
+        return 1
+    fi
+    i2cset -f -y "$led_bus" "$led_addr" "$led_port0_config" $((cfg & ~mask))
+
+    # Read-modify-write the output register so the other bits are preserved
+    led_st=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_output")
+    if ! is_integer "$led_st"; then
+        echo "Failed to read fault LED output register" >&2
+        return 1
+    fi
+    if [ "$2" == "$on" ]; then
+        led_st=$((led_st | mask))
+    else
+        led_st=$((led_st & ~mask))
+    fi
+
+    # Turn on/off the LED
+    i2cset -f -y "$led_bus" "$led_addr" "$led_port0_output" "$led_st"
+}
+
+turn_on_off_fan_fault_led() {
+    set_cpld_fault_led "$fan_fault_led_mask" "$1"
+}
+
+turn_on_off_psu_fault_led() {
+    set_cpld_fault_led "$psu_fault_led_mask" "$1"
+}
+
+# Sync the fan fault LED with fan_failed. The cached LED status is only
+# updated when the LED was really switched, so a failed write is retried.
+control_fan_fault_led() {
+    if [ "$fan_failed" == "true" ]; then
+        if [ "$fan_fault_led_status" == "$off" ] \
+            && turn_on_off_fan_fault_led "$on"; then
+            fan_fault_led_status=$on
+        fi
+    elif [ "$fan_fault_led_status" == "$on" ] \
+        && turn_on_off_fan_fault_led "$off"; then
+        fan_fault_led_status=$off
+    fi
+}
+
+# Succeed (return 0) when the PSU must be reported as failed: it is not
+# present, its STATUS_WORD cannot be read, or a bit of psu_fault_bitmask is set.
+#   $1 - name of the presence GPIO (reads 0 when the PSU is present)
+#   $2 - PMBus address of the PSU
+psu_has_fault() {
+    local presence
+    local status_word
+
+    presence=$(gpio_name_get "$1")
+    if [ "$presence" != "0" ]; then
+        return 0
+    fi
+
+    # PSU present, check the PMBus STATUS_WORD
+    status_word=$(i2cget -f -y "$psu_bus" "$2" "$status_word_cmd" w)
+    if ! is_integer "$status_word"; then
+        return 0
+    fi
+
+    [ $((status_word & psu_fault_bitmask)) != 0 ]
+}
+
+check_psu_failed() {
+    psu0_failed="false"
+    psu1_failed="false"
+
+    if psu_has_fault presence-ps0 "$psu0_addr"; then
+        psu0_failed="true"
+    fi
+    if psu_has_fault presence-ps1 "$psu1_addr"; then
+        psu1_failed="true"
+    fi
+
+    if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then
+        psu_failed="true"
+    else
+        psu_failed="false"
+    fi
+}
+
+# Sync the PSU fault LED with psu_failed; a failed write is retried next cycle.
+control_psu_fault_led() {
+    if [ "$psu_failed" == "true" ]; then
+        if [ "$psu_fault_led_status" == "$off" ] \
+            && turn_on_off_psu_fault_led "$on"; then
+            psu_fault_led_status=$on
+        fi
+    elif [ "$psu_fault_led_status" == "$on" ] \
+        && turn_on_off_psu_fault_led "$off"; then
+        psu_fault_led_status=$off
+    fi
+}
+
+check_overtemp_occured() {
+    if [[ -f $overtemp_fault_flag ]]; then
+        echo "Over temperature occured, turn on fault LED"
+        overtemp_occured="true"
+    else
+        overtemp_occured="false"
+    fi
+}
+
+check_gpio_fault() {
+    if [[ -f $gpio_fault_flag ]]; then
+        echo "GPIO fault event(s) occured, turn on fault LED"
+        gpio_fault="true"
+    else
+        gpio_fault="false"
+    fi
+}
+
+check_fault() {
+    if [[ "$fan_failed" == "true" ]] || [[ "$psu_failed" == "true" ]] \
+        || [[ "$overtemp_occured" == "true" ]] \
+        || [[ "$gpio_fault" == "true" ]]; then
+        fault="true"
+    else
+        fault="false"
+    fi
+}
+
+# The System Fault Led turns on upon any system error: update it based on the
+# combined fault status computed by check_fault.
+control_sys_fault_led() {
+    # Turn on/off the System Fault Led
+    if [ "$fault" == "true" ]; then
+        gpio_name_set led-fault $on
+    else
+        gpio_name_set led-fault $off
+    fi
+}
+
+# daemon start
+while true; do
+    # Check the fan failure flag
+    check_fan_failed
+    # Monitor PSU presence and status
+    check_psu_failed
+
+    check_overtemp_occured
+    check_gpio_fault
+    # Check fault to update fail
+    check_fault
+    control_sys_fault_led
+
+    control_fan_fault_led
+    control_psu_fault_led
+
+    sleep 2
+done
+
+exit 1