diff options
author | Thang Q. Nguyen <thang@os.amperecomputing.com> | 2023-05-05 11:33:27 +0300 |
---|---|---|
committer | Thang Q. Nguyen <thang@os.amperecomputing.com> | 2023-05-08 05:39:39 +0300 |
commit | 98de8b95f8076ccaa4e3613ead581d130bc76d55 (patch) | |
tree | 9f92795c621cfc0b99e4c97724b607550c71faf0 /meta-ampere | |
parent | ed7346e3a02e40eeb6357d466513d537897e592a (diff) | |
download | openbmc-98de8b95f8076ccaa4e3613ead581d130bc76d55.tar.xz |
meta-ampere: add fault monitor support
Add support for detecting GPIO, PSU, fan, and other faults and for turning the fault LED ON/OFF accordingly.
Tested:
1. Unplug a PSU and check that the Fault LED is turned ON.
2. Unplug a FAN and check that the Fault LED is turned ON.
3. Inject a GPIO fault pattern and check that the BMC detects it.
Signed-off-by: Thang Q. Nguyen <thang@os.amperecomputing.com>
Signed-off-by: Hieu Huynh <hieuh@os.amperecomputing.com>
Signed-off-by: Quang Nguyen <quangn@amperecomputing.com>
Change-Id: Idfcd32953cf811fbe9299a162f604cb8fd028962
Diffstat (limited to 'meta-ampere')
11 files changed, 1031 insertions, 0 deletions
diff --git a/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor.bb b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor.bb new file mode 100644 index 0000000000..79d7a3d294 --- /dev/null +++ b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor.bb @@ -0,0 +1,23 @@ +SUMMARY = "Ampere Computing LLC Fault Monitor" +DESCRIPTION = "Monitor fault events and update fault led status for Ampere systems" +PR = "r1" + +LICENSE = "Apache-2.0" + +LIC_FILES_CHKSUM = "file://${COMMON_LICENSE_DIR}/Apache-2.0;md5=89aea4e17d99a7cacdbeed46a0096b10" + +inherit systemd +inherit obmc-phosphor-systemd + +FILESEXTRAPATHS:append := "${THISDIR}/${PN}:" + +SYSTEMD_SERVICE:${PN} = "ampere-fault-monitor.service" + +GPIO_FAULT_START_TGT = "ampere-check-gpio-fault@.service" +GPIO_FAULT_START_S0_INSTMPL = "ampere-check-gpio-fault@{0}.service" +SYSTEMD_SERVICE:${PN} += "${GPIO_FAULT_START_TGT}" + +HOST_ON_STARTMIN_TGTFMT = "obmc-host-startmin@{0}.target" +GPIO_FAULT_START_S0_STARTMIN_FMT = "../${GPIO_FAULT_START_TGT}:${HOST_ON_STARTMIN_TGTFMT}.wants/${GPIO_FAULT_START_S0_INSTMPL}" +SYSTEMD_LINK:${PN} += "${@compose_list_zip(d, 'GPIO_FAULT_START_S0_STARTMIN_FMT', 'OBMC_HOST_INSTANCES')}" + diff --git a/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-check-gpio-fault@.service b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-check-gpio-fault@.service new file mode 100644 index 0000000000..8502b73c96 --- /dev/null +++ b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-check-gpio-fault@.service @@ -0,0 +1,12 @@ +[Unit] +Description=Monitor GPIO fault and updade fault LED status %i +Before=obmc-host-start-pre@0.target +After=obmc-host-already-on@0.target +Conflicts=obmc-host-stop@0.target + +[Service] +Restart=no +ExecStart=/bin/sh -c "if [ -f /usr/sbin/ampere_check_gpio_fault.sh ]; then /usr/sbin/ampere_check_gpio_fault.sh %i; fi" +ExecStopPost=/bin/sh -c "if [ -f 
/tmp/gpio_fault ]; then sleep 5; rm /tmp/gpio_fault; fi" +SyslogIdentifier=ampere_check_fault_gpio +Type=simple diff --git a/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-fault-monitor.service b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-fault-monitor.service new file mode 100644 index 0000000000..ee6af1ec79 --- /dev/null +++ b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-fault-monitor.service @@ -0,0 +1,12 @@ +[Unit] +Description=Monitor fault events and updade fault LED status +After=xyz.openbmc_project.State.Host@0.service + +[Service] +Restart=on-failure +ExecStart=/usr/sbin/ampere_fault_monitor.sh +SyslogIdentifier=ampere_fault_monitor.sh +Type=simple + +[Install] +WantedBy={SYSTEMD_DEFAULT_TARGET} diff --git a/meta-ampere/meta-jade/conf/machine/mtjade.conf b/meta-ampere/meta-jade/conf/machine/mtjade.conf index 914c1605c0..f1e51bb63d 100644 --- a/meta-ampere/meta-jade/conf/machine/mtjade.conf +++ b/meta-ampere/meta-jade/conf/machine/mtjade.conf @@ -33,6 +33,7 @@ OBMC_IMAGE_EXTRA_INSTALL:append = "\ phosphor-ipmi-blobs \ phosphor-ipmi-blobs-binarystore \ ampere-driver-binder \ + ampere-fault-monitor \ " PREFERRED_PROVIDER_virtual/obmc-chassis-mgmt = "packagegroup-ampere-apps" diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend new file mode 100644 index 0000000000..2f3e457e38 --- /dev/null +++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend @@ -0,0 +1,14 @@ +FILESEXTRAPATHS:append := "${THISDIR}/${PN}:" + +RDEPENDS:${PN} = "bash" + +SRC_URI += " \ + file://ampere_fault_monitor.sh \ + file://ampere_check_gpio_fault.sh \ + " + +do_install() { + install -d ${D}/${sbindir} + install -m 755 ${WORKDIR}/ampere_fault_monitor.sh ${D}/${sbindir}/ + install -m 755 ${WORKDIR}/ampere_check_gpio_fault.sh ${D}/${sbindir}/ +} diff --git 
a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh new file mode 100755 index 0000000000..141c50c36e --- /dev/null +++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh @@ -0,0 +1,205 @@ +#!/bin/bash + +# This script monitors S0/S1 fault GPIO and detects errors or warnings from CPUs +# +# According to OpenBMC_Software_Funcional_Specification, section 3.16, +# +# When the BMC detects the GPIO_FAULT signal indicating an SCP booting failure: +# • If a non-critical error/warning from the SCP occurs, the BMC blinks the Fault LED once. +# • If a critical error from the SCP occurs, the BMC turns on the Fault LED. +# The BMC monitors the GPIO_FAULT signal from the SCP during SCP booting to determine whether +# the error is non-critical or critical. A fatal error is indicated when the signal is On and then Off +# continuously, followed by a “quiet” period of about three seconds, and this pattern repeats. If the “quiet” +# period is longer than three seconds, the error is non-fatal. The BMC must set up appropriate debounce +# times to detect such errors. The BMC is expected to turn on the Fault LED forever for fatal errors, or to +# turn on the Fault LED and turn it off when the fault clears for non-fatal errors. 
+# +# Usage: <app_name> <socket 0/1> + +# shellcheck source=/dev/null +source /usr/sbin/gpio-lib.sh + +# global variables + error_flag='/tmp/fault_err' + warning_flag='/tmp/fault_warning' + + duty_cycle=250000 + scan_pulse=100000 + blank_num=8 + + curr_pattern=0 + prev_pattern=0 + + gpio_status=0 + repeat=0 + + socket=$1 + + socket1_present=15 + socket1_status=1 + + S0_fault_gpio=73 + S1_fault_gpio=201 + +map_event_name() { + case $curr_pattern in + 1) + event_name="RAS_GPIO_INVALID_LCS" + ;; + 2) + event_name="RAS_GPIO_FILE_HDR_INVALID" + ;; + 3) + event_name="RAS_GPIO_FILE_INTEGRITY_INVALID" + ;; + 4) + event_name="RAS_GPIO_KEY_CERT_AUTH_ERR" + ;; + 5) + event_name="RAS_GPIO_CNT_CERT_AUTH_ERR" + ;; + 6) + event_name="RAS_GPIO_I2C_HARDWARE_ERR" + ;; + 7) + event_name="RAS_GPIO_CRYPTO_ENGINE_ERR" + ;; + 8) + event_name="RAS_GPIO_ROTPK_EFUSE_INVALID" + ;; + 9) + event_name="RAS_GPIO_SEED_EFUSE_INVALID" + ;; + 10) + event_name="RAS_GPIO_LCS_FROM_EFUSE_INVALID" + ;; + 11) + event_name="RAS_GPIO_PRIM_ROLLBACK_EFUSE_INVALID" + ;; + 12) + event_name="RAS_GPIO_SEC_ROLLBACK_EFUSE_INVALID" + ;; + 13) + event_name="RAS_GPIO_HUK_EFUSE_INVALID" + ;; + 14) + event_name="RAS_GPIO_CERT_DATA_INVALID" + ;; + 15) + event_name="RAS_GPIO_INTERNAL_HW_ERR" + ;; + *) + event_name="NOT_SUPPORT" + ;; + esac +} + +detect_patern_repeat() { + local prev=0 + local curr=0 + local cnt=13 + + while true + do + usleep $scan_pulse + gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value) + prev=$curr + curr=$gpio_status + if [ "$prev" == 0 ] && [ "$curr" == 1 ]; then + # patern start repeating, check if previous and current pattern are the same + repeat=1 + break + fi + if [ "$cnt" == 0 ]; then + map_event_name + echo "detected a warning from fault GPIO #$fault_gpio $socket, event $event_name" + # pattern not repeat, this is a warning, turn on warning flag + touch $warning_flag + break + fi + cnt=$(( cnt - 1 )) + done +} + +detect_pattern() { + local cnt_falling_edge=0 + local cnt_blank=0 + + local 
prev=0 + local curr=0 + + while true + do + prev=$curr + curr=$gpio_status + # count the falling edges, if they appear, just reset cnt_blank + if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then + cnt_falling_edge=$(( cnt_falling_edge + 1 )) + cnt_blank=0 + continue + # check if we are in the quite gap + elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then + cnt_blank=$(( cnt_blank + 1 )) + if [ "$cnt_blank" == "$blank_num" ]; then + # echo "pattern number falling_edge=$cnt_falling_edge blank=$cnt_blank" + curr_pattern=$cnt_falling_edge + # after count all falling edges, now check if patern repeat after 3s + detect_patern_repeat + break + fi + fi + usleep $scan_pulse + gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value) + done +} + +gpio_config_input() { + echo "$gpio_Id" > /sys/class/gpio/export + echo "in" > /sys/class/gpio/gpio"${gpio_Id}"/direction +} + +init_sysfs_fault_gpio() { + gpio_Id=$(gpio_number "$fault_gpio") + if [ -d /sys/class/gpio/gpio"$gpio_Id" ]; then + return + fi + gpio_config_input "$fault_gpio" +} + +# init +if [ "$socket" == "0" ]; then + fault_gpio=$S0_fault_gpio +else + socket1_status=$(gpioget 0 "$socket1_present") + if [ "$socket1_status" == 1 ]; then + echo "socket 1 not present" + exit 1 + fi + fault_gpio=$S1_fault_gpio +fi + +init_sysfs_fault_gpio + +# daemon start +while true +do + # detect when pattern starts + if [ "$gpio_status" == 1 ]; then + # now, there is something on gpio, check if that is a pattern + detect_pattern + if [ "$repeat" == 1 ] && [ "$prev_pattern" == "$curr_pattern" ]; then + map_event_name + echo "detected an error from fault GPIO #$fault_gpio $socket, event#$curr_pattern $event_name" + touch $error_flag + repeat=0 + fi + prev_pattern=$curr_pattern + curr_pattern=0 + continue + fi + usleep $duty_cycle + gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value) + +done + +exit 1 diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh 
b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh new file mode 100644 index 0000000000..44cbb11b13 --- /dev/null +++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +# This script monitors fan, over-temperature, PSU, CPU/SCP failure and update fault LED status + +# shellcheck disable=SC2004 +# shellcheck disable=SC2046 +# shellcheck source=/dev/null + +# common variables + warning_fault_flag='/tmp/fault_warning' + error_fault_flag='/tmp/fault_err' + overtemp_fault_flag='/tmp/fault_overtemp' + fault_RAS_UE_flag='/tmp/fault_RAS_UE' + + blink_rate=100000 + + fault="false" + + on="true" + off="false" + + gpio_fault="false" + +# fan variables + fan_failed="false" + fan_failed_flag='/tmp/fan_failed' + +# PSU variables + psu_failed="false" + psu_bus=6 + psu0_addr=0x58 + psu1_addr=0x59 + status_word_cmd=0x79 + # Following the PMBus Specification + # Bit[1]: CML faults + # Bit[2]: Over temperature faults + # Bit[3]: Under voltage faults + # Bit[4]: Over current faults + # Bit[5]: Over voltage fault + # Bit[10]: Fan faults + psu_fault_bitmask=0x43e + +# led variables + led_service='xyz.openbmc_project.LED.GroupManager' + led_fault_path='/xyz/openbmc_project/led/groups/system_fault' + led_fault_interface='xyz.openbmc_project.Led.Group' + fault_led_status=$off + +# functions declaration +check_fan_failed() { + if [[ -f $fan_failed_flag ]]; then + fan_failed="true" + else + fan_failed="false" + fi +} + +turn_on_off_fault_led() { + busctl set-property $led_service $led_fault_path $led_fault_interface Asserted b "$1" >> /dev/null +} + +check_psu_failed() { + local psu0_presence + local psu1_presence + local psu0_value + local psu1_value + + psu0_presence=$(gpioget $(gpiofind PSU1_PRESENT)) + psu0_failed="true" + if [ "$psu0_presence" == "0" ]; then + # PSU0 presence, monitor the PSUs using pmbus, check the STATUS_WORD + psu0_value=$(i2cget -f -y $psu_bus 
$psu0_addr $status_word_cmd w) + psu0_bit_fault=$(($psu0_value & $psu_fault_bitmask)) + if [ "$psu0_bit_fault" == "0" ]; then + psu0_failed="false" + fi + fi + + psu1_presence=$(gpioget $(gpiofind PSU2_PRESENT)) + psu1_failed="true" + if [ "$psu1_presence" == "0" ]; then + # PSU1 presence, monitor the PSUs using pmbus, check the STATUS_WORD + psu1_value=$(i2cget -f -y $psu_bus $psu1_addr $status_word_cmd w) + psu1_bit_fault=$(($psu1_value & $psu_fault_bitmask)) + if [ "$psu1_bit_fault" == "0" ]; then + psu1_failed="false" + fi + fi + + if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then + psu_failed="true" + else + psu_failed="false" + fi +} + +check_fault() { + if [[ "$fan_failed" == "true" ]] || [[ "$psu_failed" == "true" ]] \ + || [[ "$gpio_fault" == "true" ]] \ + || [[ "$RAS_UE_occured" == "true" ]] \ + || [[ "$overtemp_occured" == "true" ]]; then + fault="true" + else + fault="false" + fi +} + +control_fault_led() { + if [ "$fault" == "true" ]; then + if [ "$fault_led_status" == $off ]; then + turn_on_off_fault_led $on + fault_led_status=$on + fi + else + if [ "$fault_led_status" == $on ]; then + turn_on_off_fault_led $off + fault_led_status=$off + fi + fi +} + +blink_fault_led() { + if [ "$fault_led_status" == $off ]; then + turn_on_off_fault_led $on + usleep $blink_rate + turn_on_off_fault_led $off + else + turn_on_off_fault_led $off + usleep $blink_rate + turn_on_off_fault_led $on + fi +} + +check_gpio_fault() { + if [[ -f $error_fault_flag ]]; then + gpio_fault="true" + else + if [ -f $warning_fault_flag ]; then + blink_fault_led + rm $warning_fault_flag + fi + gpio_fault="false" + fi +} + +check_RAS_UE_occured() { + if [[ -f $fault_RAS_UE_flag ]]; then + echo "RAS UE error occured, turn on fault LED" + RAS_UE_occured="true" + else + RAS_UE_occured="false" + fi +} + +check_overtemp_occured() { + if [[ -f $overtemp_fault_flag ]]; then + echo "Over temperature occured, turn on fault LED" + overtemp_occured="true" + else + 
overtemp_occured="false" + fi +} + +# daemon start +while true +do + check_gpio_fault + check_fan_failed + check_overtemp_occured + check_RAS_UE_occured + + # Monitors PSU presence + check_psu_failed + + check_fault + control_fault_led + sleep 2 +done + +exit 1 diff --git a/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf b/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf index aecb597e83..712401bbda 100644 --- a/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf +++ b/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf @@ -36,6 +36,7 @@ OBMC_IMAGE_EXTRA_INSTALL:append = " \ phosphor-ipmi-blobs-binarystore \ util-linux \ ampere-sysfw-hang-handler \ + ampere-fault-monitor \ " PREFERRED_PROVIDER_virtual/obmc-chassis-mgmt = "packagegroup-obmc-ampere-apps" diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend new file mode 100644 index 0000000000..2f3e457e38 --- /dev/null +++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend @@ -0,0 +1,14 @@ +FILESEXTRAPATHS:append := "${THISDIR}/${PN}:" + +RDEPENDS:${PN} = "bash" + +SRC_URI += " \ + file://ampere_fault_monitor.sh \ + file://ampere_check_gpio_fault.sh \ + " + +do_install() { + install -d ${D}/${sbindir} + install -m 755 ${WORKDIR}/ampere_fault_monitor.sh ${D}/${sbindir}/ + install -m 755 ${WORKDIR}/ampere_check_gpio_fault.sh ${D}/${sbindir}/ +} diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh new file mode 100755 index 0000000000..9922420b81 --- /dev/null +++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh @@ -0,0 +1,351 @@ +#!/bin/bash + +# This script monitors S0/S1 GPIO fault and detects errors from CPUs +# +# So far, there is no 
specification describes the behavior of LED when an error (a pattern is detected) occurs. +# So when detecting a pattern, we simply set the gpio fault flag and turn on the SYS LED. +# +# The Parttern will in the format: +# <minor_byte> <quite_gap_1second> <major_byte> <stop_condition_low_for_3seconds> +# +# Ex: pattern minor_byte=0x03, major_byte=0x02, you will see the waveform like +# _1010100...(quite gap, low for 1 second)..0111111111000000000111111111110000000000...(stop condition, low for 3 seconds).. +# +# Usage: <app_name> <socket 0/1> +# +# shellcheck source=/dev/null +source /usr/sbin/gpio-lib.sh + +# global variables + error_flag='/tmp/gpio_fault' + + # the command "cat /sys/class/gpio/gpio"$gpio_Id"/value" itself, takes 10ms~35ms to complete, depends on CPU loading + polling_minor_byte_rate=0 + polling_major_byte_rate=200000 + polling_rate=$polling_minor_byte_rate + + # the mount of low to ensure that already get out of minor_byte and is in quite gap + # this value depends on the polling_minor_byte_rate + max_low_in_minor_byte=9 + + # the mount of low to ensure that already get out of major_byte and is in stop condition + # this value depends on the polling_major_byte_rate + max_low_in_major_byte=9 + + max_low=$max_low_in_minor_byte + + # state machines: + # detecting_minor_byte=0 + # detecting_major_byte=1 + curr_state=0 + + minor_byte=0 + major_byte=0 + + gpio_status=0 + + socket=$1 + + socket1_present=151 + socket1_status=1 + + S0_fault_gpio='s0-fault-alert' + S1_fault_gpio='s1-fault-alert' + +map_event_name() { + case $major_byte in + 2) + event_major="FAULT_LED_BOOT_ERROR" + case $minor_byte in + 1) + event_minor="SOC_BOOTDEV_INIT_SEC_ERROR" + ;; + 2) + event_minor="SECJMP_FAIL_ERROR" + ;; + 3) + event_minor="UART_INIT_WARN" + ;; + 4) + event_minor="UART_TX_WARN" + ;; + 5) + event_minor="SOC_ROMPATCH_BAD_ERROR" + ;; + 6) + event_minor="SOC_ROMPATCH_RANGE_ERROR" + ;; + 7) + event_minor="SPI_INIT_ERROR" + ;; + 8) + event_minor="SPI_TX_ERROR" + ;; + 9) 
+ event_minor="SPINOR_UNKNOW_DEVICE_WARN" + ;; + 10) + event_minor="EEPROM_BAD_NVP_HEADER_WARN" + ;; + 11) + event_minor="EEPROM_BAD_NVP_FIELD_WARN" + ;; + 12) + event_minor="EEPROM_BAD_CHECKSUM_ERROR_WARN" + ;; + 13) + event_minor="I2C_DMA_ERROR" + ;; + 14) + event_minor="I2C_TIMEOUT_ERROR" + ;; + 15) + event_minor="SOC_BOOTDEV_SPI_LOAD_ERROR" + ;; + 16) + event_minor="SOC_BOOTDEV_AUTHENTICATION_ERROR" + ;; + 17) + event_minor="PCP_POWERUP_FAILED" + ;; + 18) + event_minor="PCP_POWERDOWN_FAILED" + ;; + 19) + event_minor="CPUPLL_INIT_FAILED" + ;; + 20) + event_minor="MESHPLL_INIT_FAILED" + ;; + *) + event_minor="NOT_SUPPORT" + esac + ;; + 3) + event_major="FAULT_LED_FW_LOAD_ERROR" + case $minor_byte in + 9) + event_minor="LFS_ERROR" + ;; + *) + event_minor="NOT_SUPPORT" + esac + ;; + 4) + event_major="FAULT_LED_SECURITY_ERROR" + case $minor_byte in + 1) + event_minor="SEC_INVALID_KEY_CERT" + ;; + 2) + event_minor="SEC_INVALID_CONT_CERT" + ;; + 3) + event_minor="SEC_INVALID_ROOT_KEY" + ;; + 4) + event_minor="SEC_INVALID_SECPRO_KEY" + ;; + 5) + event_minor="SEC_INVALID_KEY_CERT_SIG" + ;; + 6) + event_minor="SEC_INVALID_CONT_CERT_SIG" + ;; + 7) + event_minor="SEC_INVALID_IMAGE_HASH" + ;; + 8) + event_minor="SEC_INVALID_PRI_VERSION" + ;; + 9) + event_minor="SEC_HUK_MISMATCH" + ;; + 10) + event_minor="SEC_FUSE_BLOW_CERT_WITHOUT_SPECIAL_BOOT_PIN" + ;; + 11) + event_minor="SEC_INVALID_CERT_SUBTYPE_STRUCT" + ;; + 12) + event_minor="SEC_TMMCFG_FAIL" + ;; + 13) + event_minor="SEC_INVALID_LCS_FROM_EFUSE" + ;; + 14) + event_minor="SEC_EFUSE_WRITE_FAILED" + ;; + 15) + event_minor="SEC_INVALID_CERT_VALUE" + ;; + 16) + event_minor="SEC_INVALID_CERT_VERSION" + ;; + *) + event_minor="NOT_SUPPORT" + ;; + esac + ;; + 5) + event_major="FAULT_LED_EXCEPTION_ERROR" + case $minor_byte in + 1) + event_minor="KERNEL_EXCEPTION_UNKNOWN_REASON_ERROR" + ;; + 2) + event_minor="KERNEL_EXCEPTION_HARD_FAULT_ERROR" + ;; + 3) + event_minor="KERNEL_EXCEPTION_BUS_FAULT_ERROR" + ;; + 4) + 
event_minor="KERNEL_EXCEPTION_MEMMANAGE_FAULT_ERROR" + ;; + 5) + event_minor="KERNEL_EXCEPTION_USAGE_FAULT_ERROR" + ;; + *) + event_minor="NOT_SUPPORT" + ;; + esac + ;; + *) + event_major="NOT_SUPPORT" + ;; + esac +} + +set_unset_gpio_fault_flag() { + if [ ! -f $error_flag ] && [ "$1" == 1 ] ; then + touch $error_flag + elif [ -f $error_flag ] && [ "$1" == 0 ]; then + rm $error_flag + fi +} + +toggle_state() { + if [ "$curr_state" == 0 ]; then + curr_state=1 + polling_rate=$polling_major_byte_rate + else + curr_state=0 + polling_rate=$polling_minor_byte_rate + map_event_name + echo "detected major_byte=$event_major, minor_byte=$event_minor" + set_unset_gpio_fault_flag 1 + fi +} + +save_pulse_of_byte() { + if [ "$curr_state" == 0 ]; then + minor_byte=$1 + #echo "minor_byte=$1" + else + major_byte=$1 + #echo "major_byte=$1" + fi +} + +# we do not care the pulse is 50ms or 500ms, what we care is that the number of high pulses +cnt_falling_edge_in_byte() { + local cnt_falling_edge=0 + local cnt_low=0 + + local prev=0 + local curr=0 + + while true + do + prev=$curr + curr=$gpio_status + # count the falling edges, if they occur, just reset cnt_low + if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then + cnt_falling_edge=$(( cnt_falling_edge + 1 )) + cnt_low=0 + continue + # check if we are in the quite gap or stop condition + elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then + cnt_low=$(( cnt_low + 1 )) + if [ "$cnt_low" == "$max_low" ]; then + save_pulse_of_byte "$cnt_falling_edge" + toggle_state + break + fi + fi + usleep $polling_rate + gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value) + done +} + +gpio_config_input() { + echo "$gpio_Id" > /sys/class/gpio/export + echo "in" > /sys/class/gpio/gpio"${gpio_Id}"/direction +} + +gpio_number() { + local offset + local gpioPin + local str + + str=$(gpiofind "$1") + if [ "$?" 
== '1' ]; then + echo -1 + else + gpioid=$(echo "$str"|cut -c 9) + offset=$(echo "$str"|cut -d " " -f 2) + gpioPin=$(("$offset" + ${AST2600_GPIO_BASE[$gpioid]})) + echo "$gpioPin" + fi +} + +init_sysfs_fault_gpio() { + gpio_Id=$(gpio_number "$fault_gpio") + if [ "$gpio_Id" == "-1" ]; then + echo "Invalid GPIO number" + exit 1 + fi + + if [ -d /sys/class/gpio/gpio"$gpio_Id" ]; then + return + fi + gpio_config_input "$gpio_Id" +} + +# init +if [ "$socket" == "0" ]; then + fault_gpio="$S0_fault_gpio" +else + socket1_status=$(gpioget 0 "$socket1_present") + if [ "$socket1_status" == 1 ]; then + echo "socket 1 not present" + exit 0 + fi + fault_gpio=$S1_fault_gpio +fi + +init_sysfs_fault_gpio + +# daemon start +while true +do + # detect when pattern starts + if [ "$gpio_status" == 1 ]; then + if [ "$curr_state" == 0 ]; then + # detecting minor byte, set up minor byte variables + max_low=$max_low_in_minor_byte + polling_rate=$polling_minor_byte_rate + else + # detecting major byte, set up major byte variables + max_low=$max_low_in_major_byte + polling_rate=$polling_major_byte_rate + fi + # now, there is something on gpio, check if that is a byte pattern + cnt_falling_edge_in_byte + fi + + usleep $polling_rate + gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value) +done + +exit 1 diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh new file mode 100644 index 0000000000..e176629dfd --- /dev/null +++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh @@ -0,0 +1,218 @@ +#!/bin/bash +# This script monitors fan, over-temperature, PSU, CPU/SCP failure and update fault LED status + +# shellcheck disable=SC2004 +# shellcheck source=/dev/null +source /usr/sbin/gpio-lib.sh + +# common variables + on=1 + off=0 + + overtemp_fault_flag='/tmp/fault_overtemp' + +# gpio fault + 
gpio_fault="false" + gpio_fault_flag="/tmp/gpio_fault" + +# fan variables + fan_failed="false" + fan_failed_flag='/tmp/fan_failed' + +# PSU variables + psu_failed="false" + psu_bus=2 + psu0_addr=0x58 + psu1_addr=0x59 + status_word_cmd=0x79 + # Following the PMBus Specification + # Bit[1]: CML faults + # Bit[2]: Over temperature faults + # Bit[3]: Under voltage faults + # Bit[4]: Over current faults + # Bit[5]: Over voltage fault + # Bit[10]: Fan faults + psu_fault_bitmask=0x43e + +# led variables + fan_fault_led_status=$off + psu_fault_led_status=$off + led_bus=15 + led_addr=0x22 + led_port0_config=0x06 + led_port0_output=0x02 + +# functions declaration +check_fan_failed() { + if [[ -f $fan_failed_flag ]]; then + fan_failed="true" + else + fan_failed="false" + fi +} + +turn_on_off_fan_fault_led() { + # Control fan fault led via CPLD's I2C at slave address 0x22, I2C16. + # Get Port0 value + p0_val=$(i2cget -f -y $led_bus $led_addr $led_port0_config) + p0_val=$(("$p0_val" & ~1)) + # Config CPLD's IOepx Port0[0] from input to output, clear IOepx Port0[0]. + i2cset -f -y $led_bus $led_addr $led_port0_config $p0_val + + # Get led value + led_st=$(i2cget -f -y $led_bus $led_addr $led_port0_output) + + if [ "$1" == $on ]; then + led_st=$(("$led_st" | 1)) + else + led_st=$(("$led_st" & ~1)) + fi + + # Turn on/off fan fault led + i2cset -f -y $led_bus $led_addr $led_port0_output $led_st +} + +turn_on_off_psu_fault_led() { + # Control psu fault led via CPLD's I2C at slave address 0x22, I2C16. + # Get Port1 value + p1_val=$(i2cget -f -y $led_bus $led_addr $led_port0_config) + p1_val=$(("$p1_val" & ~2)) + # Config CPLD's IOepx Port0[1] from input to output, clear IOepx Port0[1]. 
+ i2cset -f -y $led_bus $led_addr $led_port0_config $p1_val + + # Get led value + led_st=$(i2cget -f -y $led_bus $led_addr $led_port0_output) + if [ "$1" == $on ]; then + led_st=$(("$led_st" | 2)) + else + led_st=$(("$led_st" & ~2)) + fi + + # Turn on/off psu fault led + i2cset -f -y $led_bus $led_addr $led_port0_output $led_st +} + +control_fan_fault_led() { + if [ "$fan_failed" == "true" ]; then + if [ "$fan_fault_led_status" == $off ]; then + turn_on_off_fan_fault_led $on + fan_fault_led_status=$on + fi + else + if [ "$fan_fault_led_status" == $on ]; then + turn_on_off_fan_fault_led $off + fan_fault_led_status=$off + fi + fi +} + +check_psu_failed() { + local psu0_presence + local psu1_presence + local psu0_value + local psu1_value + + psu0_presence=$(gpio_name_get presence-ps0) + psu0_failed="true" + if [ "$psu0_presence" == "0" ]; then + # PSU0 presence, monitor the PSUs using pmbus, check the STATUS_WORD + psu0_value=$(i2cget -f -y $psu_bus $psu0_addr $status_word_cmd w) + psu0_bit_fault=$(($psu0_value & $psu_fault_bitmask)) + if [ "$psu0_bit_fault" == "0" ]; then + psu0_failed="false" + fi + fi + + psu1_presence=$(gpio_name_get presence-ps1) + psu1_failed="true" + if [ "$psu1_presence" == "0" ]; then + # PSU1 presence, monitor the PSUs using pmbus, check the STATUS_WORD + psu1_value=$(i2cget -f -y $psu_bus $psu1_addr $status_word_cmd w) + psu1_bit_fault=$(($psu1_value & $psu_fault_bitmask)) + if [ "$psu1_bit_fault" == "0" ]; then + psu1_failed="false" + fi + fi + + if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then + psu_failed="true" + else + psu_failed="false" + fi +} + +control_psu_fault_led() { + if [ "$psu_failed" == "true" ]; then + if [ "$psu_fault_led_status" == $off ]; then + turn_on_off_psu_fault_led $on + psu_fault_led_status=$on + fi + else + if [ "$psu_fault_led_status" == $on ]; then + turn_on_off_psu_fault_led $off + psu_fault_led_status=$off + fi + fi +} + +check_overtemp_occured() { + if [[ -f $overtemp_fault_flag ]]; then 
+ echo "Over temperature occured, turn on fault LED" + overtemp_occured="true" + else + overtemp_occured="false" + fi +} + + +check_gpio_fault() { + if [[ -f $gpio_fault_flag ]]; then + echo "GPIO fault event(s) occured, turn on fault LED" + gpio_fault="true" + else + gpio_fault="false" + fi +} + +check_fault() { + if [[ "$fan_failed" == "true" ]] || [[ "$psu_failed" == "true" ]] \ + || [[ "$overtemp_occured" == "true" ]] \ + || [[ "$gpio_fault" == "true" ]]; then + fault="true" + else + fault="false" + fi +} + +# The System Fault Led turns on upon the system error, update the System Fault Led +# based on the Fan fault status and PSU fault status +control_sys_fault_led() { + # Turn on/off the System Fault Led + if [ "$fault" == "true" ]; then + gpio_name_set led-fault $on + else + gpio_name_set led-fault $off + fi +} + +# daemon start +while true +do + # Monitors Fan speeds + check_fan_failed + # Monitors PSU presence + check_psu_failed + + check_overtemp_occured + check_gpio_fault + # Check fault to update fail + check_fault + control_sys_fault_led + + control_fan_fault_led + control_psu_fault_led + + sleep 2 +done + +exit 1 |