From 1f86cb270669029d4e0483db572a553cca6649be Mon Sep 17 00:00:00 2001 From: Al Stone Date: Jul 25 2020 00:32:40 +0000 Subject: New upstream version 0.6.6 --- diff --git a/ChangeLog b/ChangeLog index 09ef95e..c8591c8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2020-07-21 Mauro Carvalho Chehab + - Version 0.6.6 + + * Support for new AMD SMCA bank types + * Add decoders for more hip08 events + * Add support for memory Corrected Error predictive failure analysis + * Some bugs fixed + 2019-11-20 Mauro Carvalho Chehab - Version 0.6.5 diff --git a/Makefile.am b/Makefile.am index 843b538..f4822b9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS=-I m4 SUBDIRS = libtrace util man SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) -EXTRA_DIST = $(SYSTEMD_SERVICES_IN) +EXTRA_DIST = $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin # during ./configure phase, therefore it is not possible to add .service.in @@ -54,12 +54,15 @@ endif if WITH_HISI_NS_DECODE rasdaemon_SOURCES += non-standard-hisi_hip07.c non-standard-hisi_hip08.c endif +if WITH_MEMORY_CE_PFA + rasdaemon_SOURCES += rbtree.c ras-page-isolation.c +endif rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that @@ -87,3 +90,6 @@ upload: install-data-local: $(install_sh) -d "$(DESTDIR)@RASSTATEDIR@" $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" +if 
WITH_MEMORY_CE_PFA + $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@sysconfdir@/sysconfig/rasdaemon" +endif diff --git a/Makefile.in b/Makefile.in index b99908a..cec531a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -108,6 +108,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT) @WITH_DISKERROR_TRUE@am__append_8 = ras-diskerror-handler.c @WITH_ABRT_REPORT_TRUE@am__append_9 = ras-report.c @WITH_HISI_NS_DECODE_TRUE@am__append_10 = non-standard-hisi_hip07.c non-standard-hisi_hip08.c +@WITH_MEMORY_CE_PFA_TRUE@am__append_11 = rbtree.c ras-page-isolation.c subdir = . ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \ @@ -137,7 +138,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c ras-events.c ras-mc-handler.c \ mce-intel-skylake-xeon.c mce-amd-k8.c mce-amd-smca.c \ ras-extlog-handler.c ras-devlink-handler.c \ ras-diskerror-handler.c ras-report.c non-standard-hisi_hip07.c \ - non-standard-hisi_hip08.c + non-standard-hisi_hip08.c rbtree.c ras-page-isolation.c @WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT) @WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT) @WITH_NON_STANDARD_TRUE@am__objects_3 = \ @@ -163,11 +164,14 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c ras-events.c ras-mc-handler.c \ @WITH_HISI_NS_DECODE_TRUE@am__objects_10 = \ @WITH_HISI_NS_DECODE_TRUE@ non-standard-hisi_hip07.$(OBJEXT) \ @WITH_HISI_NS_DECODE_TRUE@ non-standard-hisi_hip08.$(OBJEXT) +@WITH_MEMORY_CE_PFA_TRUE@am__objects_11 = rbtree.$(OBJEXT) \ +@WITH_MEMORY_CE_PFA_TRUE@ ras-page-isolation.$(OBJEXT) am_rasdaemon_OBJECTS = rasdaemon.$(OBJEXT) ras-events.$(OBJEXT) \ ras-mc-handler.$(OBJEXT) bitfield.$(OBJEXT) $(am__objects_1) \ $(am__objects_2) $(am__objects_3) $(am__objects_4) \ $(am__objects_5) $(am__objects_6) $(am__objects_7) \ - $(am__objects_8) $(am__objects_9) $(am__objects_10) + $(am__objects_8) $(am__objects_9) $(am__objects_10) \ + $(am__objects_11) rasdaemon_OBJECTS = $(am_rasdaemon_OBJECTS) am__DEPENDENCIES_1 = rasdaemon_DEPENDENCIES = 
$(am__DEPENDENCIES_1) libtrace/libtrace.a @@ -208,8 +212,9 @@ am__depfiles_remade = ./$(DEPDIR)/bitfield.Po \ ./$(DEPDIR)/ras-extlog-handler.Po \ ./$(DEPDIR)/ras-mc-handler.Po ./$(DEPDIR)/ras-mce-handler.Po \ ./$(DEPDIR)/ras-non-standard-handler.Po \ - ./$(DEPDIR)/ras-record.Po ./$(DEPDIR)/ras-report.Po \ - ./$(DEPDIR)/rasdaemon.Po + ./$(DEPDIR)/ras-page-isolation.Po ./$(DEPDIR)/ras-record.Po \ + ./$(DEPDIR)/ras-report.Po ./$(DEPDIR)/rasdaemon.Po \ + ./$(DEPDIR)/rbtree.Po am__mv = mv -f COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) @@ -431,6 +436,7 @@ WITH_DISKERROR = @WITH_DISKERROR@ WITH_EXTLOG = @WITH_EXTLOG@ WITH_HISI_NS_DECODE = @WITH_HISI_NS_DECODE@ WITH_MCE = @WITH_MCE@ +WITH_MEMORY_CE_PFA = @WITH_MEMORY_CE_PFA@ WITH_NON_STANDARD = @WITH_NON_STANDARD@ WITH_SQLITE3 = @WITH_SQLITE3@ abs_builddir = @abs_builddir@ @@ -494,7 +500,7 @@ ACLOCAL_AMFLAGS = -I m4 SUBDIRS = libtrace util man SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) -EXTRA_DIST = $(SYSTEMD_SERVICES_IN) +EXTRA_DIST = $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin # during ./configure phase, therefore it is not possible to add .service.in @@ -504,12 +510,12 @@ rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ bitfield.c $(am__append_1) $(am__append_2) $(am__append_3) \ $(am__append_4) $(am__append_5) $(am__append_6) \ $(am__append_7) $(am__append_8) $(am__append_9) \ - $(am__append_10) + $(am__append_10) $(am__append_11) rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h + 
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h all: config.h $(MAKE) $(AM_MAKEFLAGS) all-recursive @@ -653,9 +659,11 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ras-mc-handler.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ras-mce-handler.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ras-non-standard-handler.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ras-page-isolation.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ras-record.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ras-report.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rasdaemon.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rbtree.Po@am__quote@ # am--include-marker $(am__depfiles_remade): @$(MKDIR_P) $(@D) @@ -1083,9 +1091,11 @@ distclean: distclean-recursive -rm -f ./$(DEPDIR)/ras-mc-handler.Po -rm -f ./$(DEPDIR)/ras-mce-handler.Po -rm -f ./$(DEPDIR)/ras-non-standard-handler.Po + -rm -f ./$(DEPDIR)/ras-page-isolation.Po -rm -f ./$(DEPDIR)/ras-record.Po -rm -f ./$(DEPDIR)/ras-report.Po -rm -f ./$(DEPDIR)/rasdaemon.Po + -rm -f ./$(DEPDIR)/rbtree.Po -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-hdr distclean-libtool distclean-tags @@ -1160,9 +1170,11 @@ maintainer-clean: maintainer-clean-recursive -rm -f ./$(DEPDIR)/ras-mc-handler.Po -rm -f ./$(DEPDIR)/ras-mce-handler.Po -rm -f ./$(DEPDIR)/ras-non-standard-handler.Po + -rm -f ./$(DEPDIR)/ras-page-isolation.Po -rm -f ./$(DEPDIR)/ras-record.Po -rm -f ./$(DEPDIR)/ras-report.Po -rm -f ./$(DEPDIR)/rasdaemon.Po + -rm -f ./$(DEPDIR)/rbtree.Po -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic @@ -1238,6 +1250,7 @@ upload: install-data-local: $(install_sh) -d "$(DESTDIR)@RASSTATEDIR@" 
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" +@WITH_MEMORY_CE_PFA_TRUE@ $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@sysconfdir@/sysconfig/rasdaemon" # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/config.h b/config.h index 3c1e208..94689f3 100644 --- a/config.h +++ b/config.h @@ -31,6 +31,9 @@ /* "have PCIe MCE events collect" */ #define HAVE_MCE 1 +/* "have memory corrected error predictive failure analysis" */ +#define HAVE_MEMORY_CE_PFA 1 + /* Define to 1 if you have the header file. */ #define HAVE_MEMORY_H 1 @@ -101,7 +104,7 @@ #define PACKAGE_NAME "RASdaemon" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "RASdaemon 0.6.5" +#define PACKAGE_STRING "RASdaemon 0.6.6" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "rasdaemon" @@ -110,7 +113,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "0.6.5" +#define PACKAGE_VERSION "0.6.6" /* rasdaemon db store state dir */ #define RASSTATEDIR "/usr/local/var/lib/rasdaemon" @@ -122,4 +125,4 @@ #define STDC_HEADERS 1 /* Version number of package */ -#define VERSION "0.6.5" +#define VERSION "0.6.6" diff --git a/config.h.in b/config.h.in index 27e0184..c308096 100644 --- a/config.h.in +++ b/config.h.in @@ -30,6 +30,9 @@ /* "have PCIe MCE events collect" */ #undef HAVE_MCE +/* "have memory corrected error predictive failure analysis" */ +#undef HAVE_MEMORY_CE_PFA + /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H diff --git a/configure b/configure index d80bc92..3512794 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for RASdaemon 0.6.5. +# Generated by GNU Autoconf 2.69 for RASdaemon 0.6.6. 
# # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -587,8 +587,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='RASdaemon' PACKAGE_TARNAME='rasdaemon' -PACKAGE_VERSION='0.6.5' -PACKAGE_STRING='RASdaemon 0.6.5' +PACKAGE_VERSION='0.6.6' +PACKAGE_STRING='RASdaemon 0.6.6' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -635,6 +635,9 @@ LIBOBJS RAS_DB_FNAME RASSTATEDIR rasstatedir +WITH_MEMORY_CE_PFA_FALSE +WITH_MEMORY_CE_PFA_TRUE +WITH_MEMORY_CE_PFA WITH_HISI_NS_DECODE_FALSE WITH_HISI_NS_DECODE_TRUE WITH_HISI_NS_DECODE @@ -816,6 +819,7 @@ enable_devlink enable_diskerror enable_abrt_report enable_hisi_ns_decode +enable_memory_ce_pfa ' ac_precious_vars='build_alias host_alias @@ -1367,7 +1371,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures RASdaemon 0.6.5 to adapt to many kinds of systems. +\`configure' configures RASdaemon 0.6.6 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1438,7 +1442,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of RASdaemon 0.6.5:";; + short | recursive ) echo "Configuration of RASdaemon 0.6.6:";; esac cat <<\_ACEOF @@ -1472,6 +1476,8 @@ Optional Features: --enable-abrt-report enable report event to ABRT (currently experimental) --enable-hisi-ns-decode enable HISI_NS_DECODE events (currently experimental) + --enable-memory-ce-pfa enable memory Corrected Error predictive failure + analysis Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] @@ -1563,7 +1569,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -RASdaemon configure 0.6.5 +RASdaemon configure 0.6.6 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. 
@@ -1841,7 +1847,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by RASdaemon $as_me 0.6.5, which was +It was created by RASdaemon $as_me 0.6.6, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2818,7 +2824,7 @@ fi # Define the identity of the package. PACKAGE='rasdaemon' - VERSION='0.6.5' + VERSION='0.6.6' cat >>confdefs.h <<_ACEOF @@ -12468,6 +12474,34 @@ else USE_HISI_NS_DECODE="no" fi +# Check whether --enable-memory_ce_pfa was given. +if test "${enable_memory_ce_pfa+set}" = set; then : + enableval=$enable_memory_ce_pfa; +fi + + +if test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" == "xyes"; then : + + +$as_echo "#define HAVE_MEMORY_CE_PFA 1" >>confdefs.h + + + +fi + if test x$enable_memory_ce_pfa = xyes || test x$enable_all == xyes; then + WITH_MEMORY_CE_PFA_TRUE= + WITH_MEMORY_CE_PFA_FALSE='#' +else + WITH_MEMORY_CE_PFA_TRUE='#' + WITH_MEMORY_CE_PFA_FALSE= +fi + +if test -z "$WITH_MEMORY_CE_PFA_TRUE"; then : + USE_MEMORY_CE_PFA="yes" +else + USE_MEMORY_CE_PFA="no" +fi + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" @@ -12672,6 +12706,10 @@ if test -z "${WITH_HISI_NS_DECODE_TRUE}" && test -z "${WITH_HISI_NS_DECODE_FALSE as_fn_error $? "conditional \"WITH_HISI_NS_DECODE\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_MEMORY_CE_PFA_TRUE}" && test -z "${WITH_MEMORY_CE_PFA_FALSE}"; then + as_fn_error $? "conditional \"WITH_MEMORY_CE_PFA\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 @@ -13069,7 +13107,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. 
instead of their # values after options handling. ac_log=" -This file was extended by RASdaemon $as_me 0.6.5, which was +This file was extended by RASdaemon $as_me 0.6.6, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -13135,7 +13173,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -RASdaemon config.status 0.6.5 +RASdaemon config.status 0.6.6 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" @@ -14845,4 +14883,5 @@ compile time options summary ARM events : $USE_ARM DEVLINK : $USE_DEVLINK Disk I/O errors : $USE_DISKERROR + Memory CE PFA : $USE_MEMORY_CE_PFA EOF diff --git a/configure.ac b/configure.ac index a5a04dc..2d6c59c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([RASdaemon], 0.6.5) +AC_INIT([RASdaemon], 0.6.6) AM_SILENT_RULES([yes]) AC_CANONICAL_SYSTEM AC_CONFIG_MACRO_DIR([m4]) @@ -131,6 +131,16 @@ AS_IF([test "x$enable_hisi_ns_decode" = "xyes" || test "x$enable_all" == "xyes"] AM_CONDITIONAL([WITH_HISI_NS_DECODE], [test x$enable_hisi_ns_decode = xyes || test x$enable_all == xyes]) AM_COND_IF([WITH_HISI_NS_DECODE], [USE_HISI_NS_DECODE="yes"], [USE_HISI_NS_DECODE="no"]) +AC_ARG_ENABLE([memory_ce_pfa], + AS_HELP_STRING([--enable-memory-ce-pfa], [enable memory Corrected Error predictive failure analysis])) + +AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" == "xyes"], [ + AC_DEFINE(HAVE_MEMORY_CE_PFA,1,"have memory corrected error predictive failure analysis") + AC_SUBST([WITH_MEMORY_CE_PFA]) +]) +AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all == xyes]) +AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"]) + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" @@ -162,4 
+172,5 @@ compile time options summary ARM events : $USE_ARM DEVLINK : $USE_DEVLINK Disk I/O errors : $USE_DISKERROR + Memory CE PFA : $USE_MEMORY_CE_PFA EOF diff --git a/libtrace/Makefile.in b/libtrace/Makefile.in index a6ce864..2ae2f80 100644 --- a/libtrace/Makefile.in +++ b/libtrace/Makefile.in @@ -298,6 +298,7 @@ WITH_DISKERROR = @WITH_DISKERROR@ WITH_EXTLOG = @WITH_EXTLOG@ WITH_HISI_NS_DECODE = @WITH_HISI_NS_DECODE@ WITH_MCE = @WITH_MCE@ +WITH_MEMORY_CE_PFA = @WITH_MEMORY_CE_PFA@ WITH_NON_STANDARD = @WITH_NON_STANDARD@ WITH_SQLITE3 = @WITH_SQLITE3@ abs_builddir = @abs_builddir@ diff --git a/man/Makefile.in b/man/Makefile.in index bbc9ecf..777c12d 100644 --- a/man/Makefile.in +++ b/man/Makefile.in @@ -240,6 +240,7 @@ WITH_DISKERROR = @WITH_DISKERROR@ WITH_EXTLOG = @WITH_EXTLOG@ WITH_HISI_NS_DECODE = @WITH_HISI_NS_DECODE@ WITH_MCE = @WITH_MCE@ +WITH_MEMORY_CE_PFA = @WITH_MEMORY_CE_PFA@ WITH_NON_STANDARD = @WITH_NON_STANDARD@ WITH_SQLITE3 = @WITH_SQLITE3@ abs_builddir = @abs_builddir@ diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in index 834df16..833c8e1 100644 --- a/man/rasdaemon.1.in +++ b/man/rasdaemon.1.in @@ -62,6 +62,13 @@ feature. .BI "--version" Print the program version and exit. +.SH CONFIG FILE + +The \fBrasdaemon\fR program supports a config file to set rasdaemon systemd service +environment variables. By default the config file is read from /etc/sysconfig/rasdaemon. + +The general format is environmentname=value. + .SH SEE ALSO \fBras-mc-ctl\fR(8) diff --git a/mce-amd-smca.c b/mce-amd-smca.c index 6c3e8a5..114e786 100644 --- a/mce-amd-smca.c +++ b/mce-amd-smca.c @@ -49,11 +49,17 @@ enum smca_bank_types { SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ SMCA_CS, /* Coherent Slave */ + SMCA_CS_V2, /* Coherent Slave V2 */ SMCA_PIE, /* Power, Interrupts, etc. 
*/ SMCA_UMC, /* Unified Memory Controller */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ + SMCA_PSP_V2, /* Platform Security Processor V2 */ SMCA_SMU, /* System Management Unit */ + SMCA_SMU_V2, /* System Management Unit V2 */ + SMCA_MP5, /* Microprocessor 5 Unit */ + SMCA_NBIO, /* Northbridge IO Unit */ + SMCA_PCIE, /* PCI Express Unit */ N_SMCA_BANK_TYPES }; @@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = { "Atomic request parity", "ECC error on probe filter access", }; +/* Coherent Slave Unit V2 */ +static const char * const smca_cs2_mce_desc[] = { + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "SDP read response had no match in the CS queue", + "Probe Filter Protocol Error", + "Probe Filter ECC Error", + "SDP read response had an unexpected RETRY error", + "Counter overflow error", + "Counter underflow error", +}; /* Power, Interrupt, etc.. 
*/ static const char * const smca_pie_mce_desc[] = { "HW assert", @@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = { static const char * const smca_psp_mce_desc[] = { "PSP RAM ECC or parity error", }; +/* Platform Security Processor V2 */ +static const char * const smca_psp2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Instruction Cache Bank 0 ECC or parity error", + "Instruction Cache Bank 1 ECC or parity error", + "Instruction Tag Ram 0 parity error", + "Instruction Tag Ram 1 parity error", + "Data Cache Bank 0 ECC or parity error", + "Data Cache Bank 1 ECC or parity error", + "Data Cache Bank 2 ECC or parity error", + "Data Cache Bank 3 ECC or parity error", + "Data Tag Bank 0 parity error", + "Data Tag Bank 1 parity error", + "Data Tag Bank 2 parity error", + "Data Tag Bank 3 parity error", + "Dirty Data Ram parity error", + "TLB Bank 0 parity error", + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", +}; /* System Management Unit */ static const char * const smca_smu_mce_desc[] = { "SMU RAM ECC or parity error", }; +/* System Management Unit V2 */ +static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", +}; +/* Microprocessor 5 Unit */ +static const char * const smca_mp5_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", 
+ "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", +}; +/* Northbridge IO Unit */ +static const char * const smca_nbio_mce_desc[] = { + "ECC or Parity error", + "PCIE error", + "SDP ErrEvent error", + "SDP Egress Poison Error", + "IOHC Internal Poison Error", +}; +/* PCI Express Unit */ +static const char * const smca_pcie_mce_desc[] = { + "CCIX PER Message logging", + "CCIX Read Response with Status: Non-Data Error", + "CCIX Write Response with Status: Non-Data Error", + "CCIX Read Response with Status: Data Error", + "CCIX Non-okay write response with data error", +}; + struct smca_mce_desc { const char * const *descs; @@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)}, + [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, + [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, + [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, }; struct smca_hwid { @@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Data Fabric MCA types */ { SMCA_CS, 
0x0000002E }, + { SMCA_CS_V2, 0x0002002E }, { SMCA_PIE, 0x0001002E }, /* Unified Memory Controller MCA type */ @@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Platform Security Processor MCA type */ { SMCA_PSP, 0x000000FF }, + { SMCA_PSP_V2, 0x000100FF }, /* System Management Unit MCA type */ { SMCA_SMU, 0x00000001 }, + { SMCA_SMU_V2, 0x00010001 }, + + /* Microprocessor 5 Unit MCA type */ + { SMCA_MP5, 0x00020001 }, + + /* Northbridge IO Unit MCA type */ + { SMCA_NBIO, 0x00000018 }, + + /* PCI Express Unit MCA type */ + { SMCA_PCIE, 0x00000046 }, }; struct smca_bank_name { @@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = { [SMCA_FP] = { "Floating Point Unit" }, [SMCA_L3_CACHE] = { "L3 Cache" }, [SMCA_CS] = { "Coherent Slave" }, + [SMCA_CS_V2] = { "Coherent Slave" }, [SMCA_PIE] = { "Power, Interrupts, etc." }, [SMCA_UMC] = { "Unified Memory Controller" }, [SMCA_PB] = { "Parameter Block" }, [SMCA_PSP] = { "Platform Security Processor" }, + [SMCA_PSP_V2] = { "Platform Security Processor" }, [SMCA_SMU] = { "System Management Unit" }, + [SMCA_SMU_V2] = { "System Management Unit" }, + [SMCA_MP5] = { "Microprocessor 5 Unit" }, + [SMCA_NBIO] = { "Northbridge IO Unit" }, + [SMCA_PCIE] = { "PCI Express Unit" }, }; static void amd_decode_errcode(struct mce_event *e) diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env new file mode 100644 index 0000000..12fd766 --- /dev/null +++ b/misc/rasdaemon.env @@ -0,0 +1,29 @@ +# Page Isolation +# Note: Run-time configuration is unsupported, service restart needed. +# Note: this file should be installed at /etc/sysconfig/rasdaemon + +# Specify the threshold for isolating buggy pages. +# +# Format: +# [0-9]+[unit] +# Notice: make sure the value matches this format; rasdaemon uses the default value for malformed input.
+# +# Supported units: +# PAGE_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default unit is hour +# PAGE_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none +# +# These two settings are ignored only when PAGE_CE_ACTION is "off". +PAGE_CE_REFRESH_CYCLE="24h" +PAGE_CE_THRESHOLD="50" + +# Specify the action rasdaemon takes when a page exceeds the error threshold. +# +# off no action +# account only account errors +# soft try to soft-offline page without killing any processes +# This requires an up-to-date kernel. Might not be successful. +# hard try to hard-offline page by killing processes +# Requires an up-to-date kernel. Might not be successful. +# soft-then-hard First try to soft offline, then try hard offlining. +# Note: default offline choice is "soft". +PAGE_CE_ACTION="soft" diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in index be9ad5a..e73a08a 100644 --- a/misc/rasdaemon.service.in +++ b/misc/rasdaemon.service.in @@ -3,6 +3,7 @@ Description=RAS daemon to log the RAS events After=syslog.target [Service] +EnvironmentFile=/etc/sysconfig/rasdaemon ExecStart=@sbindir@/rasdaemon -f -r ExecStartPost=@sbindir@/rasdaemon --enable ExecStop=@sbindir@/rasdaemon --disable diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in index 48dd311..b2572f5 100644 --- a/misc/rasdaemon.spec.in +++ b/misc/rasdaemon.spec.in @@ -56,13 +56,18 @@ rm INSTALL %{buildroot}/usr/include/*.h %{_unitdir}/*.service %{_sharedstatedir}/rasdaemon %{_sysconfdir}/ras/dimm_labels.d +%{_sysconfdir}/sysconfig/%{name} +%config(noreplace) %{_sysconfdir}/sysconfig/%{name} %changelog +* Tue Jul 21 2020 Mauro Carvalho Chehab 0.6.6-1 +- Bump to version 0.6.6 with several fixes, new hip08 events and memory prediction analysis + * Wed Nov 20 2019 Mauro Carvalho Chehab 0.6.5-1 - Bump to version 0.6.5 with several fixes and improves PCIe events record -* Fri Oct 10 2019 Mauro Carvalho Chehab 0.6.4-1 +* Thu Oct 10 2019 Mauro Carvalho Chehab 0.6.4-1 - Bump to version 0.6.4
with some DB changes for hip08 and some fixes * Fri Aug 23 2019 Mauro Carvalho Chehab 0.6.3-1 diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c index 1774ec7..8bf10c1 100644 --- a/non-standard-hisi_hip08.c +++ b/non-standard-hisi_hip08.c @@ -52,6 +52,8 @@ #define HISI_OEM_MODULE_ID_PA 2 #define HISI_OEM_MODULE_ID_HLLC 3 #define HISI_OEM_MODULE_ID_DDRC 4 +#define HISI_OEM_MODULE_ID_L3T 5 +#define HISI_OEM_MODULE_ID_L3D 6 #define HISI_OEM_TYPE2_VALID_ERR_FR BIT(6) #define HISI_OEM_TYPE2_VALID_ERR_CTRL BIT(7) @@ -78,6 +80,14 @@ #define HISI_PCIE_LOCAL_VALID_ERR_SEVERITY BIT(8) #define HISI_PCIE_LOCAL_VALID_ERR_MISC 9 +#define HISI_PCIE_LOCAL_ERR_MISC_MAX 33 +#define HISI_BUF_LEN 1024 + +#define HISI_ERR_SEVERITY_NFE 0 +#define HISI_ERR_SEVERITY_FE 1 +#define HISI_ERR_SEVERITY_CE 2 +#define HISI_ERR_SEVERITY_NONE 3 + struct hisi_oem_type1_err_sec { uint32_t val_bits; uint8_t version; @@ -132,99 +142,208 @@ struct hisi_pcie_local_err_sec { uint8_t err_severity; uint16_t err_type; uint8_t reserv[2]; - uint32_t err_misc[33]; + uint32_t err_misc[HISI_PCIE_LOCAL_ERR_MISC_MAX]; }; enum hisi_oem_data_type { - hisi_oem_data_type_int, - hisi_oem_data_type_int64, - hisi_oem_data_type_text, + HISI_OEM_DATA_TYPE_INT, + HISI_OEM_DATA_TYPE_INT64, + HISI_OEM_DATA_TYPE_TEXT, }; enum { - hip08_oem_type1_field_id, - hip08_oem_type1_field_timestamp, - hip08_oem_type1_field_version, - hip08_oem_type1_field_soc_id, - hip08_oem_type1_field_socket_id, - hip08_oem_type1_field_nimbus_id, - hip08_oem_type1_field_module_id, - hip08_oem_type1_field_sub_module_id, - hip08_oem_type1_field_err_sev, - hip08_oem_type1_field_regs_dump, + HIP08_OEM_TYPE1_FIELD_ID, + HIP08_OEM_TYPE1_FIELD_TIMESTAMP, + HIP08_OEM_TYPE1_FIELD_VERSION, + HIP08_OEM_TYPE1_FIELD_SOC_ID, + HIP08_OEM_TYPE1_FIELD_SOCKET_ID, + HIP08_OEM_TYPE1_FIELD_NIMBUS_ID, + HIP08_OEM_TYPE1_FIELD_MODULE_ID, + HIP08_OEM_TYPE1_FIELD_SUB_MODULE_ID, + HIP08_OEM_TYPE1_FIELD_ERR_SEV, + HIP08_OEM_TYPE1_FIELD_REGS_DUMP, }; enum { - 
hip08_oem_type2_field_id, - hip08_oem_type2_field_timestamp, - hip08_oem_type2_field_version, - hip08_oem_type2_field_soc_id, - hip08_oem_type2_field_socket_id, - hip08_oem_type2_field_nimbus_id, - hip08_oem_type2_field_module_id, - hip08_oem_type2_field_sub_module_id, - hip08_oem_type2_field_err_sev, - hip08_oem_type2_field_regs_dump, + HIP08_OEM_TYPE2_FIELD_ID, + HIP08_OEM_TYPE2_FIELD_TIMESTAMP, + HIP08_OEM_TYPE2_FIELD_VERSION, + HIP08_OEM_TYPE2_FIELD_SOC_ID, + HIP08_OEM_TYPE2_FIELD_SOCKET_ID, + HIP08_OEM_TYPE2_FIELD_NIMBUS_ID, + HIP08_OEM_TYPE2_FIELD_MODULE_ID, + HIP08_OEM_TYPE2_FIELD_SUB_MODULE_ID, + HIP08_OEM_TYPE2_FIELD_ERR_SEV, + HIP08_OEM_TYPE2_FIELD_REGS_DUMP, }; enum { - hip08_pcie_local_field_id, - hip08_pcie_local_field_timestamp, - hip08_pcie_local_field_version, - hip08_pcie_local_field_soc_id, - hip08_pcie_local_field_socket_id, - hip08_pcie_local_field_nimbus_id, - hip08_pcie_local_field_sub_module_id, - hip08_pcie_local_field_core_id, - hip08_pcie_local_field_port_id, - hip08_pcie_local_field_err_sev, - hip08_pcie_local_field_err_type, - hip08_pcie_local_field_regs_dump, + HIP08_PCIE_LOCAL_FIELD_ID, + HIP08_PCIE_LOCAL_FIELD_TIMESTAMP, + HIP08_PCIE_LOCAL_FIELD_VERSION, + HIP08_PCIE_LOCAL_FIELD_SOC_ID, + HIP08_PCIE_LOCAL_FIELD_SOCKET_ID, + HIP08_PCIE_LOCAL_FIELD_NIMBUS_ID, + HIP08_PCIE_LOCAL_FIELD_SUB_MODULE_ID, + HIP08_PCIE_LOCAL_FIELD_CORE_ID, + HIP08_PCIE_LOCAL_FIELD_PORT_ID, + HIP08_PCIE_LOCAL_FIELD_ERR_SEV, + HIP08_PCIE_LOCAL_FIELD_ERR_TYPE, + HIP08_PCIE_LOCAL_FIELD_REGS_DUMP, +}; + +struct hisi_module_info { + int id; + const char *name; + const char **sub; + int sub_num; }; /* helper functions */ static char *err_severity(uint8_t err_sev) { switch (err_sev) { - case 0: return "recoverable"; - case 1: return "fatal"; - case 2: return "corrected"; - case 3: return "none"; + case HISI_ERR_SEVERITY_NFE: return "recoverable"; + case HISI_ERR_SEVERITY_FE: return "fatal"; + case HISI_ERR_SEVERITY_CE: return "corrected"; + case HISI_ERR_SEVERITY_NONE: 
return "none"; + default: + break; } return "unknown"; } -static char *oem_type1_module_name(uint8_t module_id) -{ - switch (module_id) { - case HISI_OEM_MODULE_ID_MN: return "MN"; - case HISI_OEM_MODULE_ID_PLL: return "PLL"; - case HISI_OEM_MODULE_ID_SLLC: return "SLLC"; - case HISI_OEM_MODULE_ID_AA: return "AA"; - case HISI_OEM_MODULE_ID_SIOE: return "SIOE"; - case HISI_OEM_MODULE_ID_POE: return "POE"; - case HISI_OEM_MODULE_ID_DISP: return "DISP"; - case HISI_OEM_MODULE_ID_LPC: return "LPC"; - case HISI_OEM_MODULE_ID_GIC: return "GIC"; - case HISI_OEM_MODULE_ID_RDE: return "RDE"; - case HISI_OEM_MODULE_ID_SAS: return "SAS"; - case HISI_OEM_MODULE_ID_SATA: return "SATA"; - case HISI_OEM_MODULE_ID_USB: return "USB"; - } - return "unknown"; -} +static const char *pll_submodule_name[] = { + "TB_PLL0", + "TB_PLL1", + "TB_PLL2", + "TB_PLL3", + "TA_PLL0", + "TA_PLL1", + "TA_PLL2", + "TA_PLL3", + "NIMBUS_PLL0", + "NIMBUS_PLL1", + "NIMBUS_PLL2", + "NIMBUS_PLL3", + "NIMBUS_PLL4", +}; -static char *oem_type2_module_name(uint8_t module_id) -{ - switch (module_id) { - case HISI_OEM_MODULE_ID_SMMU: return "SMMU"; - case HISI_OEM_MODULE_ID_HHA: return "HHA"; - case HISI_OEM_MODULE_ID_HLLC: return "HLLC"; - case HISI_OEM_MODULE_ID_PA: return "PA"; - case HISI_OEM_MODULE_ID_DDRC: return "DDRC"; - } - return "unknown module"; -} +static const char *sllc_submodule_name[] = { + "TB_SLLC0", + "TB_SLLC1", + "TB_SLLC2", + "TA_SLLC0", + "TA_SLLC1", + "TA_SLLC2", + "NIMBUS_SLLC0", + "NIMBUS_SLLC1", +}; + +static const char *sioe_submodule_name[] = { + "TB_SIOE0", + "TB_SIOE1", + "TB_SIOE2", + "TB_SIOE3", + "TA_SIOE0", + "TA_SIOE1", + "TA_SIOE2", + "TA_SIOE3", + "NIMBUS_SIOE0", + "NIMBUS_SIOE1", +}; + +static const char *poe_submodule_name[] = { + "TB_POE", + "TA_POE", +}; + +static const char *disp_submodule_name[] = { + "TB_PERI_DISP", + "TB_POE_DISP", + "TB_GIC_DISP", + "TA_PERI_DISP", + "TA_POE_DISP", + "TA_GIC_DISP", + "HAC_DISP", + "PCIE_DISP", + "IO_MGMT_DISP", + "NETWORK_DISP", 
+}; + +static const char *sas_submodule_name[] = { + "SAS0", + "SAS1", +}; + +static const struct hisi_module_info hisi_oem_type1_module[] = { + { + .id = HISI_OEM_MODULE_ID_PLL, + .name = "PLL", + .sub = pll_submodule_name, + .sub_num = ARRAY_SIZE(pll_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_SAS, + .name = "SAS", + .sub = sas_submodule_name, + .sub_num = ARRAY_SIZE(sas_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_POE, + .name = "POE", + .sub = poe_submodule_name, + .sub_num = ARRAY_SIZE(poe_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_SLLC, + .name = "SLLC", + .sub = sllc_submodule_name, + .sub_num = ARRAY_SIZE(sllc_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_SIOE, + .name = "SIOE", + .sub = sioe_submodule_name, + .sub_num = ARRAY_SIZE(sioe_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_DISP, + .name = "DISP", + .sub = disp_submodule_name, + .sub_num = ARRAY_SIZE(disp_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_MN, + .name = "MN", + }, + { + .id = HISI_OEM_MODULE_ID_AA, + .name = "AA", + }, + { + .id = HISI_OEM_MODULE_ID_LPC, + .name = "LPC", + }, + { + .id = HISI_OEM_MODULE_ID_GIC, + .name = "GIC", + }, + { + .id = HISI_OEM_MODULE_ID_RDE, + .name = "RDE", + }, + { + .id = HISI_OEM_MODULE_ID_SATA, + .name = "SATA", + }, + { + .id = HISI_OEM_MODULE_ID_USB, + .name = "USB", + }, + { + } +}; static const char *smmu_submodule_name[] = { "HAC_SMMU", @@ -240,44 +359,131 @@ static const char *hllc_submodule_name[] = { }; static const char *hha_submodule_name[] = { - "TA_HHA0", - "TA_HHA1", "TB_HHA0", - "TB_HHA1" + "TB_HHA1", + "TA_HHA0", + "TA_HHA1" }; static const char *ddrc_submodule_name[] = { - "TA_DDRC0", - "TA_DDRC1", - "TA_DDRC2", - "TA_DDRC3", "TB_DDRC0", "TB_DDRC1", "TB_DDRC2", "TB_DDRC3", + "TA_DDRC0", + "TA_DDRC1", + "TA_DDRC2", + "TA_DDRC3", +}; + +static const char *l3tag_submodule_name[] = { + "TB_PARTITION0", + "TB_PARTITION1", + "TB_PARTITION2", + "TB_PARTITION3", + "TB_PARTITION4", + "TB_PARTITION5", + 
"TB_PARTITION6", + "TB_PARTITION7", + "TA_PARTITION0", + "TA_PARTITION1", + "TA_PARTITION2", + "TA_PARTITION3", + "TA_PARTITION4", + "TA_PARTITION5", + "TA_PARTITION6", + "TA_PARTITION7", +}; + +static const char *l3data_submodule_name[] = { + "TB_BANK0", + "TB_BANK1", + "TB_BANK2", + "TB_BANK3", + "TA_BANK0", + "TA_BANK1", + "TA_BANK2", + "TA_BANK3", +}; + +static const struct hisi_module_info hisi_oem_type2_module[] = { + { + .id = HISI_OEM_MODULE_ID_SMMU, + .name = "SMMU", + .sub = smmu_submodule_name, + .sub_num = ARRAY_SIZE(smmu_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_HHA, + .name = "HHA", + .sub = hha_submodule_name, + .sub_num = ARRAY_SIZE(hha_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_PA, + .name = "PA", + }, + { + .id = HISI_OEM_MODULE_ID_HLLC, + .name = "HLLC", + .sub = hllc_submodule_name, + .sub_num = ARRAY_SIZE(hllc_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_DDRC, + .name = "DDRC", + .sub = ddrc_submodule_name, + .sub_num = ARRAY_SIZE(ddrc_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_L3T, + .name = "L3TAG", + .sub = l3tag_submodule_name, + .sub_num = ARRAY_SIZE(l3tag_submodule_name), + }, + { + .id = HISI_OEM_MODULE_ID_L3D, + .name = "L3DATA", + .sub = l3data_submodule_name, + .sub_num = ARRAY_SIZE(l3data_submodule_name), + }, + { + } }; -static const char *oem_type2_sub_module_name(uint8_t module_id, uint8_t sub_module_id) +static const char *oem_module_name(const struct hisi_module_info *info, + uint8_t module_id) { - switch (module_id) { - case HISI_OEM_MODULE_ID_SMMU: - if (sub_module_id < sizeof(smmu_submodule_name)/sizeof(char *)) - return smmu_submodule_name[sub_module_id]; - break; - case HISI_OEM_MODULE_ID_HLLC: - if (sub_module_id < sizeof(hllc_submodule_name)/sizeof(char *)) - return hllc_submodule_name[sub_module_id]; - break; - case HISI_OEM_MODULE_ID_PA: - return "PA"; - case HISI_OEM_MODULE_ID_HHA: - if (sub_module_id < sizeof(hha_submodule_name)/sizeof(char *)) - return 
hha_submodule_name[sub_module_id]; - break; - case HISI_OEM_MODULE_ID_DDRC: - if (sub_module_id < sizeof(ddrc_submodule_name)/sizeof(char *)) - return ddrc_submodule_name[sub_module_id]; - break; + const struct hisi_module_info *module = &info[0]; + + for (; module->name; module++) { + if (module->id != module_id) + continue; + + return module->name; + } + + return "unknown"; +} + +static const char *oem_submodule_name(const struct hisi_module_info *info, + uint8_t module_id, uint8_t sub_module_id) +{ + const struct hisi_module_info *module = &info[0]; + + for (; module->name; module++) { + const char **submodule = module->sub; + + if (module->id != module_id) + continue; + + if (module->sub == NULL) + return module->name; + + if (sub_module_id >= module->sub_num) + return "unknown"; + + return submodule[sub_module_id]; } return "unknown"; @@ -291,6 +497,8 @@ static char *pcie_local_sub_module_name(uint8_t id) case HISI_PCIE_SUB_MODULE_ID_MAC: return "MAC_Layer"; case HISI_PCIE_SUB_MODULE_ID_DL: return "DL_Layer"; case HISI_PCIE_SUB_MODULE_ID_SDI: return "SDI_Layer"; + default: + break; } return "unknown"; } @@ -360,13 +568,13 @@ static void record_vendor_data(struct ras_ns_dec_tab *dec_tab, int id, int64_t data, const char *text) { switch (data_type) { - case hisi_oem_data_type_int: + case HISI_OEM_DATA_TYPE_INT: sqlite3_bind_int(dec_tab->stmt_dec_record, id, data); break; - case hisi_oem_data_type_int64: + case HISI_OEM_DATA_TYPE_INT64: sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data); break; - case hisi_oem_data_type_text: + case HISI_OEM_DATA_TYPE_TEXT: sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL); break; default: @@ -374,7 +582,8 @@ static void record_vendor_data(struct ras_ns_dec_tab *dec_tab, } } -static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, char *name) +static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, + const char *name) { int rc; @@ -408,146 +617,141 @@ static int step_vendor_data_tab(struct 
ras_ns_dec_tab *dec_tab, char *name) } #endif -/* error data decoding functions */ -static int decode_hip08_oem_type1_error(struct ras_events *ras, - struct ras_ns_dec_tab *dec_tab, - struct trace_seq *s, - struct ras_non_standard_event *event) +#define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end)) +static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + const struct hisi_oem_type1_err_sec *err) { - const struct hisi_oem_type1_err_sec *err = - (struct hisi_oem_type1_err_sec*)event->error; - char buf[1024]; + char buf[HISI_BUF_LEN]; char *p = buf; + char *end = buf + HISI_BUF_LEN; - if (err->val_bits == 0) { - trace_seq_printf(s, "%s: no valid error information\n", - __func__); - return -1; - } + p += snprintf(p, end - p, "[ table_version=%d ", err->version); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE1_FIELD_VERSION, err->version, NULL); -#ifdef HAVE_SQLITE3 - if (!dec_tab->stmt_dec_record) { - if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, - &hip08_oem_type1_event_tab) - != SQLITE_OK) { - trace_seq_printf(s, - "create sql hip08_oem_type1_event_tab fail\n"); - return -1; - } - } -#endif - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type1_field_timestamp, - 0, event->timestamp); - - p += sprintf(p, "[ "); - p += sprintf(p, "table_version=%d ", err->version); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type1_field_version, err->version, NULL); - - if (err->val_bits & HISI_OEM_VALID_SOC_ID) { - p += sprintf(p, "SOC_ID=%d ", err->soc_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type1_field_soc_id, + if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE1_FIELD_SOC_ID, err->soc_id, NULL); } - if (err->val_bits & HISI_OEM_VALID_SOCKET_ID) { - p += sprintf(p, "socket_ID=%d ", 
err->socket_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type1_field_socket_id, + if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE1_FIELD_SOCKET_ID, err->socket_id, NULL); } - if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID) { - p += sprintf(p, "nimbus_ID=%d ", err->nimbus_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type1_field_nimbus_id, + if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE1_FIELD_NIMBUS_ID, err->nimbus_id, NULL); } - if (err->val_bits & HISI_OEM_VALID_MODULE_ID) { - p += sprintf(p, "module=%s ", - oem_type1_module_name(err->module_id)); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type1_field_module_id, - 0, oem_type1_module_name(err->module_id)); + if (err->val_bits & HISI_OEM_VALID_MODULE_ID && IN_RANGE(p, buf, end)) { + const char *str = oem_module_name(hisi_oem_type1_module, + err->module_id); + + p += snprintf(p, end - p, "module=%s ", str); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE1_FIELD_MODULE_ID, + 0, str); } - if (err->val_bits & HISI_OEM_VALID_SUB_MODULE_ID) { - char submodule_name[64]; + if (err->val_bits & HISI_OEM_VALID_SUB_MODULE_ID && + IN_RANGE(p, buf, end)) { + const char *str = oem_submodule_name(hisi_oem_type1_module, + err->module_id, + err->sub_module_id); - sprintf(submodule_name, "%s%d", - oem_type1_module_name(err->module_id), - err->sub_module_id); - p += sprintf(p, "submodule=%s ", submodule_name); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type1_field_sub_module_id, - 0, submodule_name); + p += snprintf(p, end - p, "submodule=%s ", str); + record_vendor_data(dec_tab, 
HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE1_FIELD_SUB_MODULE_ID, + 0, str); } - if (err->val_bits & HISI_OEM_VALID_ERR_SEVERITY) { - p += sprintf(p, "error_severity=%s ", + if (err->val_bits & HISI_OEM_VALID_ERR_SEVERITY && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "error_severity=%s ", err_severity(err->err_severity)); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type1_field_err_sev, + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE1_FIELD_ERR_SEV, 0, err_severity(err->err_severity)); } - p += sprintf(p, "]"); - trace_seq_printf(s, "\nHISI HIP08: OEM Type-1 Error\n"); + if (IN_RANGE(p, buf, end)) + p += snprintf(p, end - p, "]"); + trace_seq_printf(s, "%s\n", buf); +} + +static void decode_oem_type1_err_regs(struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + const struct hisi_oem_type1_err_sec *err) +{ + char buf[HISI_BUF_LEN]; + char *p = buf; + char *end = buf + HISI_BUF_LEN; - p = buf; trace_seq_printf(s, "Reg Dump:\n"); if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_0) { trace_seq_printf(s, "ERR_MISC0=0x%x\n", err->err_misc_0); - p += sprintf(p, "ERR_MISC0=0x%x ", err->err_misc_0); + p += snprintf(p, end - p, "ERR_MISC0=0x%x ", err->err_misc_0); } - if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_1) { + if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_1 && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_MISC1=0x%x\n", err->err_misc_1); - p += sprintf(p, "ERR_MISC1=0x%x ", err->err_misc_1); + p += snprintf(p, end - p, "ERR_MISC1=0x%x ", err->err_misc_1); } - if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_2) { + if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_2 && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_MISC2=0x%x\n", err->err_misc_2); - p += sprintf(p, "ERR_MISC2=0x%x ", err->err_misc_2); + p += snprintf(p, end - p, "ERR_MISC2=0x%x ", err->err_misc_2); } - if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_3) { + if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_3 && + 
IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_MISC3=0x%x\n", err->err_misc_3); - p += sprintf(p, "ERR_MISC3=0x%x ", err->err_misc_3); + p += snprintf(p, end - p, "ERR_MISC3=0x%x ", err->err_misc_3); } - if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_4) { + if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_MISC_4 && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_MISC4=0x%x\n", err->err_misc_4); - p += sprintf(p, "ERR_MISC4=0x%x ", err->err_misc_4); + p += snprintf(p, end - p, "ERR_MISC4=0x%x ", err->err_misc_4); } - if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_ADDR) { - trace_seq_printf(s, "ERR_ADDR=0x%p\n", (void *)err->err_addr); - p += sprintf(p, "ERR_ADDR=0x%p ", (void *)err->err_addr); + if (err->val_bits & HISI_OEM_TYPE1_VALID_ERR_ADDR && + IN_RANGE(p, buf, end)) { + trace_seq_printf(s, "ERR_ADDR=0x%llx\n", + (unsigned long long)err->err_addr); + p += snprintf(p, end - p, "ERR_ADDR=0x%llx ", + (unsigned long long)err->err_addr); } - *(--p) = '\0'; - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type1_field_regs_dump, 0, buf); + if (p > buf && p < end) { + p--; + *p = '\0'; + } + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE1_FIELD_REGS_DUMP, 0, buf); step_vendor_data_tab(dec_tab, "hip08_oem_type1_event_tab"); - - return 0; } -static int decode_hip08_oem_type2_error(struct ras_events *ras, +/* error data decoding functions */ +static int decode_hip08_oem_type1_error(struct ras_events *ras, struct ras_ns_dec_tab *dec_tab, struct trace_seq *s, struct ras_non_standard_event *event) { - const struct hisi_oem_type2_err_sec *err = - (struct hisi_oem_type2_err_sec *)event->error; - char buf[1024]; - char *p = buf; + const struct hisi_oem_type1_err_sec *err = + (struct hisi_oem_type1_err_sec*)event->error; if (err->val_bits == 0) { trace_seq_printf(s, "%s: no valid error information\n", @@ -558,136 +762,168 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras, #ifdef HAVE_SQLITE3 if 
(!dec_tab->stmt_dec_record) { if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, - &hip08_oem_type2_event_tab) != SQLITE_OK) { + &hip08_oem_type1_event_tab) + != SQLITE_OK) { trace_seq_printf(s, - "create sql hip08_oem_type2_event_tab fail\n"); + "create sql hip08_oem_type1_event_tab fail\n"); return -1; } } #endif - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type2_field_timestamp, + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE1_FIELD_TIMESTAMP, 0, event->timestamp); - p += sprintf(p, "[ "); - p += sprintf(p, "table_version=%d ", err->version); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type2_field_version, - err->version, NULL); - if (err->val_bits & HISI_OEM_VALID_SOC_ID) { - p += sprintf(p, "SOC_ID=%d ", err->soc_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type2_field_soc_id, + trace_seq_printf(s, "\nHISI HIP08: OEM Type-1 Error\n"); + decode_oem_type1_err_hdr(dec_tab, s, err); + decode_oem_type1_err_regs(dec_tab, s, err); + + return 0; +} + +static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + const struct hisi_oem_type2_err_sec *err) +{ + char buf[HISI_BUF_LEN]; + char *p = buf; + char *end = buf + HISI_BUF_LEN; + + p += snprintf(p, end - p, "[ table_version=%d ", err->version); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE2_FIELD_VERSION, err->version, NULL); + + if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE2_FIELD_SOC_ID, err->soc_id, NULL); } - if (err->val_bits & HISI_OEM_VALID_SOCKET_ID) { - p += sprintf(p, "socket_ID=%d ", err->socket_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type2_field_socket_id, + if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, 
"socket_ID=%d ", err->socket_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE2_FIELD_SOCKET_ID, err->socket_id, NULL); } - if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID) { - p += sprintf(p, "nimbus_ID=%d ", err->nimbus_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_oem_type2_field_nimbus_id, + if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_OEM_TYPE2_FIELD_NIMBUS_ID, err->nimbus_id, NULL); } - if (err->val_bits & HISI_OEM_VALID_MODULE_ID) { - p += sprintf(p, "module=%s ", - oem_type2_module_name(err->module_id)); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type2_field_module_id, - 0, oem_type2_module_name(err->module_id)); + if (err->val_bits & HISI_OEM_VALID_MODULE_ID && IN_RANGE(p, buf, end)) { + const char *str = oem_module_name(hisi_oem_type2_module, + err->module_id); + + p += snprintf(p, end - p, "module=%s ", str); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE2_FIELD_MODULE_ID, + 0, str); } - if (err->val_bits & HISI_OEM_VALID_SUB_MODULE_ID) { - const char *str = oem_type2_sub_module_name(err->module_id, - err->sub_module_id); + if (err->val_bits & HISI_OEM_VALID_SUB_MODULE_ID && + IN_RANGE(p, buf, end)) { + const char *str = oem_submodule_name(hisi_oem_type2_module, + err->module_id, + err->sub_module_id); - p += sprintf(p, "submodule=%s ", str); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type2_field_sub_module_id, + p += snprintf(p, end - p, "submodule=%s ", str); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE2_FIELD_SUB_MODULE_ID, 0, str); } - if (err->val_bits & HISI_OEM_VALID_ERR_SEVERITY) { - p += sprintf(p, "error_severity=%s ", + if (err->val_bits & HISI_OEM_VALID_ERR_SEVERITY && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "error_severity=%s ", 
err_severity(err->err_severity)); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type2_field_err_sev, + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE2_FIELD_ERR_SEV, 0, err_severity(err->err_severity)); } - p += sprintf(p, "]"); - trace_seq_printf(s, "\nHISI HIP08: OEM Type-2 Error\n"); + if (IN_RANGE(p, buf, end)) + p += snprintf(p, end - p, "]"); + trace_seq_printf(s, "%s\n", buf); +} + +static void decode_oem_type2_err_regs(struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + const struct hisi_oem_type2_err_sec *err) +{ + char buf[HISI_BUF_LEN]; + char *p = buf; + char *end = buf + HISI_BUF_LEN; - p = buf; trace_seq_printf(s, "Reg Dump:\n"); if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_FR) { trace_seq_printf(s, "ERR_FR_0=0x%x\n", err->err_fr_0); trace_seq_printf(s, "ERR_FR_1=0x%x\n", err->err_fr_1); - p += sprintf(p, "ERR_FR_0=0x%x ERR_FR_1=0x%x ", + p += snprintf(p, end - p, "ERR_FR_0=0x%x ERR_FR_1=0x%x ", err->err_fr_0, err->err_fr_1); } - if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_CTRL) { + if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_CTRL && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_CTRL_0=0x%x\n", err->err_ctrl_0); trace_seq_printf(s, "ERR_CTRL_1=0x%x\n", err->err_ctrl_1); - p += sprintf(p, "ERR_CTRL_0=0x%x ERR_CTRL_1=0x%x ", - err->err_ctrl_0, err->err_ctrl_1); + p += snprintf(p, end - p, "ERR_CTRL_0=0x%x ERR_CTRL_1=0x%x ", + err->err_ctrl_0, err->err_ctrl_1); } - if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_STATUS) { + if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_STATUS && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_STATUS_0=0x%x\n", err->err_status_0); trace_seq_printf(s, "ERR_STATUS_1=0x%x\n", err->err_status_1); - p += sprintf(p, "ERR_STATUS_0=0x%x ERR_STATUS_1=0x%x ", - err->err_status_0, err->err_status_1); + p += snprintf(p, end - p, "ERR_STATUS_0=0x%x ERR_STATUS_1=0x%x ", + err->err_status_0, err->err_status_1); } - if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_ADDR) { + if 
(err->val_bits & HISI_OEM_TYPE2_VALID_ERR_ADDR && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_ADDR_0=0x%x\n", err->err_addr_0); trace_seq_printf(s, "ERR_ADDR_1=0x%x\n", err->err_addr_1); - p += sprintf(p, "ERR_ADDR_0=0x%x ERR_ADDR_1=0x%x ", - err->err_addr_0, err->err_addr_1); + p += snprintf(p, end - p, "ERR_ADDR_0=0x%x ERR_ADDR_1=0x%x ", + err->err_addr_0, err->err_addr_1); } - if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_MISC_0) { + if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_MISC_0 && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_MISC0_0=0x%x\n", err->err_misc0_0); trace_seq_printf(s, "ERR_MISC0_1=0x%x\n", err->err_misc0_1); - p += sprintf(p, "ERR_MISC0_0=0x%x ERR_MISC0_1=0x%x ", - err->err_misc0_0, err->err_misc0_1); + p += snprintf(p, end - p, "ERR_MISC0_0=0x%x ERR_MISC0_1=0x%x ", + err->err_misc0_0, err->err_misc0_1); } - if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_MISC_1) { + if (err->val_bits & HISI_OEM_TYPE2_VALID_ERR_MISC_1 && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_MISC1_0=0x%x\n", err->err_misc1_0); trace_seq_printf(s, "ERR_MISC1_1=0x%x\n", err->err_misc1_1); - p += sprintf(p, "ERR_MISC1_0=0x%x ERR_MISC1_1=0x%x ", - err->err_misc1_0, err->err_misc1_1); + p += snprintf(p, end - p, "ERR_MISC1_0=0x%x ERR_MISC1_1=0x%x ", + err->err_misc1_0, err->err_misc1_1); } - *(--p) = '\0'; - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_oem_type2_field_regs_dump, 0, buf); + if (p > buf && p < end) { + p--; + *p = '\0'; + } + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE2_FIELD_REGS_DUMP, 0, buf); step_vendor_data_tab(dec_tab, "hip08_oem_type2_event_tab"); - - return 0; } -static int decode_hip08_pcie_local_error(struct ras_events *ras, - struct ras_ns_dec_tab *dec_tab, - struct trace_seq *s, - struct ras_non_standard_event *event) +static int decode_hip08_oem_type2_error(struct ras_events *ras, + struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + struct ras_non_standard_event *event) { - const 
struct hisi_pcie_local_err_sec *err = - (struct hisi_pcie_local_err_sec *)event->error; - char buf[1024]; - char *p = buf; - uint32_t i; + const struct hisi_oem_type2_err_sec *err = + (struct hisi_oem_type2_err_sec *)event->error; if (err->val_bits == 0) { trace_seq_printf(s, "%s: no valid error information\n", @@ -698,99 +934,170 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, #ifdef HAVE_SQLITE3 if (!dec_tab->stmt_dec_record) { if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, - &hip08_pcie_local_event_tab) != SQLITE_OK) { + &hip08_oem_type2_event_tab) != SQLITE_OK) { trace_seq_printf(s, - "create sql hip08_pcie_local_event_tab fail\n"); + "create sql hip08_oem_type2_event_tab fail\n"); return -1; } } #endif - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_pcie_local_field_timestamp, + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE2_FIELD_TIMESTAMP, 0, event->timestamp); - p += sprintf(p, "[ "); - p += sprintf(p, "table_version=%d ", err->version); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_pcie_local_field_version, + trace_seq_printf(s, "\nHISI HIP08: OEM Type-2 Error\n"); + decode_oem_type2_err_hdr(dec_tab, s, err); + decode_oem_type2_err_regs(dec_tab, s, err); + + return 0; +} + +static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + const struct hisi_pcie_local_err_sec *err) +{ + char buf[HISI_BUF_LEN]; + char *p = buf; + char *end = buf + HISI_BUF_LEN; + + p += snprintf(p, end - p, "[ table_version=%d ", err->version); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_PCIE_LOCAL_FIELD_VERSION, err->version, NULL); - if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOC_ID) { - p += sprintf(p, "SOC_ID=%d ", err->soc_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_pcie_local_field_soc_id, + + if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOC_ID && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "SOC_ID=%d ", 
err->soc_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_PCIE_LOCAL_FIELD_SOC_ID, err->soc_id, NULL); } - if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOCKET_ID) { - p += sprintf(p, "socket_ID=%d ", err->socket_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_pcie_local_field_socket_id, + if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOCKET_ID && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_PCIE_LOCAL_FIELD_SOCKET_ID, err->socket_id, NULL); } - if (err->val_bits & HISI_PCIE_LOCAL_VALID_NIMBUS_ID) { - p += sprintf(p, "nimbus_ID=%d ", err->nimbus_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_pcie_local_field_nimbus_id, + if (err->val_bits & HISI_PCIE_LOCAL_VALID_NIMBUS_ID && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_PCIE_LOCAL_FIELD_NIMBUS_ID, err->nimbus_id, NULL); } - if (err->val_bits & HISI_PCIE_LOCAL_VALID_SUB_MODULE_ID) { - p += sprintf(p, "submodule=%s ", - pcie_local_sub_module_name(err->sub_module_id)); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_pcie_local_field_sub_module_id, + if (err->val_bits & HISI_PCIE_LOCAL_VALID_SUB_MODULE_ID && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "submodule=%s ", + pcie_local_sub_module_name(err->sub_module_id)); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_PCIE_LOCAL_FIELD_SUB_MODULE_ID, 0, pcie_local_sub_module_name(err->sub_module_id)); } - if (err->val_bits & HISI_PCIE_LOCAL_VALID_CORE_ID) { - p += sprintf(p, "core_ID=core%d ", err->core_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_pcie_local_field_core_id, + if (err->val_bits & HISI_PCIE_LOCAL_VALID_CORE_ID && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "core_ID=core%d ", err->core_id); + record_vendor_data(dec_tab, 
HISI_OEM_DATA_TYPE_INT, + HIP08_PCIE_LOCAL_FIELD_CORE_ID, err->core_id, NULL); } - if (err->val_bits & HISI_PCIE_LOCAL_VALID_PORT_ID) { - p += sprintf(p, "port_ID=port%d ", err->port_id); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_pcie_local_field_port_id, + if (err->val_bits & HISI_PCIE_LOCAL_VALID_PORT_ID && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "port_ID=port%d ", err->port_id); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_PCIE_LOCAL_FIELD_PORT_ID, err->port_id, NULL); } - if (err->val_bits & HISI_PCIE_LOCAL_VALID_ERR_SEVERITY) { - p += sprintf(p, "error_severity=%s ", - err_severity(err->err_severity)); - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_pcie_local_field_err_sev, + if (err->val_bits & HISI_PCIE_LOCAL_VALID_ERR_SEVERITY && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "error_severity=%s ", + err_severity(err->err_severity)); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_PCIE_LOCAL_FIELD_ERR_SEV, 0, err_severity(err->err_severity)); } - if (err->val_bits & HISI_PCIE_LOCAL_VALID_ERR_TYPE) { - p += sprintf(p, "error_type=0x%x ", err->err_type); - record_vendor_data(dec_tab, hisi_oem_data_type_int, - hip08_pcie_local_field_err_type, + if (err->val_bits & HISI_PCIE_LOCAL_VALID_ERR_TYPE && + IN_RANGE(p, buf, end)) { + p += snprintf(p, end - p, "error_type=0x%x ", err->err_type); + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, + HIP08_PCIE_LOCAL_FIELD_ERR_TYPE, err->err_type, NULL); } - p += sprintf(p, "]"); - trace_seq_printf(s, "\nHISI HIP08: PCIe local error\n"); + if (IN_RANGE(p, buf, end)) + p += snprintf(p, end - p, "]"); + trace_seq_printf(s, "%s\n", buf); +} + +static void decode_pcie_local_err_regs(struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + const struct hisi_pcie_local_err_sec *err) +{ + char buf[HISI_BUF_LEN]; + char *p = buf; + char *end = buf + HISI_BUF_LEN; + uint32_t i; - p = buf; trace_seq_printf(s, "Reg Dump:\n"); - for (i = 0; i 
< 33; i++) { - if (err->val_bits & BIT(HISI_PCIE_LOCAL_VALID_ERR_MISC + i)) { + for (i = 0; i < HISI_PCIE_LOCAL_ERR_MISC_MAX; i++) { + if (err->val_bits & BIT(HISI_PCIE_LOCAL_VALID_ERR_MISC + i) && + IN_RANGE(p, buf, end)) { trace_seq_printf(s, "ERR_MISC_%d=0x%x\n", i, err->err_misc[i]); - p += sprintf(p, "ERR_MISC_%d=0x%x ", i, err->err_misc[i]); + p += snprintf(p, end - p, "ERR_MISC_%d=0x%x ", + i, err->err_misc[i]); } } - *(--p) = '\0'; - record_vendor_data(dec_tab, hisi_oem_data_type_text, - hip08_pcie_local_field_regs_dump, 0, buf); + if (p > buf && p < end) { + p--; + *p = '\0'; + } + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_PCIE_LOCAL_FIELD_REGS_DUMP, 0, buf); step_vendor_data_tab(dec_tab, "hip08_pcie_local_event_tab"); +} + +static int decode_hip08_pcie_local_error(struct ras_events *ras, + struct ras_ns_dec_tab *dec_tab, + struct trace_seq *s, + struct ras_non_standard_event *event) +{ + const struct hisi_pcie_local_err_sec *err = + (struct hisi_pcie_local_err_sec *)event->error; + + if (err->val_bits == 0) { + trace_seq_printf(s, "%s: no valid error information\n", + __func__); + return -1; + } + +#ifdef HAVE_SQLITE3 + if (!dec_tab->stmt_dec_record) { + if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, + &hip08_pcie_local_event_tab) != SQLITE_OK) { + trace_seq_printf(s, + "create sql hip08_pcie_local_event_tab fail\n"); + return -1; + } + } +#endif + record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, + HIP08_PCIE_LOCAL_FIELD_TIMESTAMP, + 0, event->timestamp); + + trace_seq_printf(s, "\nHISI HIP08: PCIe local error\n"); + decode_pcie_local_err_hdr(dec_tab, s, err); + decode_pcie_local_err_regs(dec_tab, s, err); return 0; } @@ -811,8 +1118,7 @@ struct ras_ns_dec_tab hip08_ns_oem_tab[] = { { /* sentinel */ } }; -__attribute__((constructor)) -static void hip08_init(void) +static void __attribute__((constructor)) hip08_init(void) { register_ns_dec_tab(hip08_ns_oem_tab); } diff --git a/ras-events.c b/ras-events.c index 
511c93d..a99fd29 100644 --- a/ras-events.c +++ b/ras-events.c @@ -39,6 +39,7 @@ #include "ras-diskerror-handler.h" #include "ras-record.h" #include "ras-logger.h" +#include "ras-page-isolation.h" /* * Polling time, if read() doesn't block. Currently, trace_pipe_raw never @@ -409,8 +410,10 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, } log(TERM, LOG_INFO, "Listening to events for cpus 0 to %d\n", n_cpus - 1); - if (pdata[0].ras->record_events) - ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras); + if (pdata[0].ras->record_events) { + if (ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras)) + goto error; + } do { ready = poll(fds, (n_cpus + 1), -1); @@ -494,10 +497,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, "Old kernel detected. Stop listening and fall back to pthread way.\n"); cleanup: - if (pdata[0].ras->record_events) { - unregister_ns_dec_tab(); + if (pdata[0].ras->record_events) ras_mc_event_closedb(pdata[0].cpu, pdata[0].ras); - } error: kbuffer_free(kbuf); @@ -584,15 +585,20 @@ static void *handle_ras_events_cpu(void *priv) } log(TERM, LOG_INFO, "Listening to events on cpu %d\n", pdata->cpu); - if (pdata->ras->record_events) - ras_mc_event_opendb(pdata->cpu, pdata->ras); + if (pdata->ras->record_events) { + if (ras_mc_event_opendb(pdata->cpu, pdata->ras)) { + log(TERM, LOG_ERR, "Can't open database\n"); + close(fd); + kbuffer_free(kbuf); + free(page); + return 0; + } + } read_ras_event(fd, pdata, kbuf, page); - if (pdata->ras->record_events) { - unregister_ns_dec_tab(); + if (pdata->ras->record_events) ras_mc_event_closedb(pdata->cpu, pdata->ras); - } close(fd); kbuffer_free(kbuf); @@ -798,6 +804,11 @@ int handle_ras_events(int record_events) ras->page_size = page_size; ras->record_events = record_events; +#ifdef HAVE_MEMORY_CE_PFA + /* FIXME: enable memory isolation unconditionally */ + ras_page_account_init(); +#endif + rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", ras_mc_event_handler, NULL, 
MC_EVENT); if (!rc) diff --git a/ras-mc-handler.c b/ras-mc-handler.c index deb7e05..42b05cd 100644 --- a/ras-mc-handler.c +++ b/ras-mc-handler.c @@ -23,6 +23,7 @@ #include "ras-mc-handler.h" #include "ras-record.h" #include "ras-logger.h" +#include "ras-page-isolation.h" #include "ras-report.h" int ras_mc_event_handler(struct trace_seq *s, @@ -183,6 +184,12 @@ int ras_mc_event_handler(struct trace_seq *s, ras_store_mc_event(ras, &ev); +#ifdef HAVE_MEMORY_CE_PFA + /* Account page corrected errors */ + if (!strcmp(ev.error_type, "Corrected")) + ras_record_page_error(ev.address, ev.error_count, now); +#endif + #ifdef HAVE_ABRT_REPORT /* Report event to ABRT */ ras_report_mc_event(ras, &ev); diff --git a/ras-mce-handler.c b/ras-mce-handler.c index fd3ef3b..016acae 100644 --- a/ras-mce-handler.c +++ b/ras-mce-handler.c @@ -55,7 +55,7 @@ static char *cputype_name[] = { [CPU_KNIGHTS_LANDING] = "Knights Landing", [CPU_KNIGHTS_MILL] = "Knights Mill", [CPU_SKYLAKE_XEON] = "Skylake server", - [CPU_NAPLES] = "AMD Family 17h Zen1", + [CPU_AMD_SMCA] = "AMD Scalable MCA", [CPU_DHYANA] = "Hygon Family 18h Moksha" }; @@ -192,8 +192,10 @@ static int detect_cpu(struct ras_events *ras) if (!strcmp(mce->vendor, "AuthenticAMD")) { if (mce->family == 15) mce->cputype = CPU_K8; - if (mce->family == 23) - mce->cputype = CPU_NAPLES; + if (strstr(mce->processor_flags, "smca")) { + mce->cputype = CPU_AMD_SMCA; + goto ret; + } if (mce->family > 23) { log(ALL, LOG_INFO, "Can't parse MCE for this AMD CPU yet %d\n", @@ -441,7 +443,7 @@ int ras_mce_event_handler(struct trace_seq *s, case CPU_K8: rc = parse_amd_k8_event(ras, &e); break; - case CPU_NAPLES: + case CPU_AMD_SMCA: case CPU_DHYANA: rc = parse_amd_smca_event(ras, &e); break; diff --git a/ras-mce-handler.h b/ras-mce-handler.h index 4d615b4..ec9a076 100644 --- a/ras-mce-handler.h +++ b/ras-mce-handler.h @@ -47,7 +47,7 @@ enum cputype { CPU_KNIGHTS_LANDING, CPU_KNIGHTS_MILL, CPU_SKYLAKE_XEON, - CPU_NAPLES, + CPU_AMD_SMCA, CPU_DHYANA, }; diff 
--git a/ras-page-isolation.c b/ras-page-isolation.c new file mode 100644 index 0000000..50e4406 --- /dev/null +++ b/ras-page-isolation.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +*/ + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include "ras-logger.h" +#include "ras-page-isolation.h" +/* Size of the stack buffers that hold the printable threshold and cycle settings. */ +#define PARSED_ENV_LEN 50 +static const struct config threshold_units[] = { + { "m", 1000 }, + { "k", 1000 }, + { "", 1 }, + {} +}; +/* Unit tables run from the largest unit to the smallest: a parsed value is multiplied by the factor of its unit and by every factor after it, so d scales by 24*60*60*1. */ +static const struct config cycle_units[] = { + { "d", 24 }, + { "h", 60 }, + { "m", 60 }, + { "s", 1 }, + {} +}; +/* Corrected-error threshold: defaults to 50, overridden by PAGE_CE_THRESHOLD. */ +static struct isolation threshold = { + .name = "PAGE_CE_THRESHOLD", + .units = threshold_units, + .env = "50", + .unit = "", +}; +/* Refresh cycle: defaults to 24h, overridden by PAGE_CE_REFRESH_CYCLE. */ +static struct isolation cycle = { + .name = "PAGE_CE_REFRESH_CYCLE", + .units = cycle_units, + .env = "24h", + .unit = "h", +}; +/* sysfs file written for each offlining policy; soft-then-hard is probed through the soft file. */ +static const char *kernel_offline[] = { + [OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page", + [OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page", + [OFFLINE_SOFT_THEN_HARD] = "/sys/devices/system/memory/soft_offline_page", +}; +/* Accepted PAGE_CE_ACTION values; the table is also indexed by enum otype when the choice is logged. */ +static const struct config offline_choice[] = { + { "off", OFFLINE_OFF }, + { "account", OFFLINE_ACCOUNT }, + { "soft", OFFLINE_SOFT }, + { "hard", OFFLINE_HARD }, + { "soft-then-hard", OFFLINE_SOFT_THEN_HARD }, + {} +}; +/* Printable names for enum pstate, used when the offlining result is logged. */ +static const char *page_state[] = { + [PAGE_ONLINE] = "online", + [PAGE_OFFLINE] = "offlined", + [PAGE_OFFLINE_FAILED] =
"offline-failed", +}; +/* Active offline policy (soft by default) and the tree of per-page error records, keyed by page address. */ +static enum otype offline = OFFLINE_SOFT; +static struct rb_root page_records; +/* Select the offline policy from PAGE_CE_ACTION and degrade to accounting only when the matching kernel sysfs file is not writable. */ +static void page_offline_init(void) +{ + const char *env = "PAGE_CE_ACTION"; + char *choice = getenv(env); + const struct config *c = NULL; + int matched = 0; + + if (choice) { + for (c = offline_choice; c->name; c++) { + if (!strcasecmp(choice, c->name)) { + offline = c->val; + matched = 1; + break; + } + } + } + + if (!matched) + log(TERM, LOG_INFO, "Improper %s, set to default soft\n", env); + + if (offline > OFFLINE_ACCOUNT && access(kernel_offline[offline], W_OK)) { + log(TERM, LOG_INFO, "Kernel does not support page offline interface\n"); + offline = OFFLINE_ACCOUNT; + } + + log(TERM, LOG_INFO, "Page offline choice on Corrected Errors is %s\n", + offline_choice[offline].name); +} +/* Parse the variable named config->name (digits plus an optional one-character unit) into config->val; the built-in default is kept when the variable is unset or malformed. */ +static void parse_isolation_env(struct isolation *config) +{ + char *env = getenv(config->name); + char *unit = NULL; + const struct config *units = NULL; + int i, no_unit = 0; /* must start defined: the unset/invalid path never assigns it but still reads it after parse: */ + int valid = 0; + int unit_matched = 0; + unsigned long value, tmp; + + /* check if env is valid */ + if (env && strlen(env)) { + /* All characters before the unit must be digits */ + for (i = 0; i < strlen(env) - 1; i++) { + if (!isdigit((unsigned char)env[i])) + goto parse; + } + if (sscanf(env, "%lu", &value) < 1 || !value) + goto parse; + /* check if the unit is valid */ + unit = env + strlen(env) - 1; + /* no unit, all the characters are value characters */ + if (isdigit((unsigned char)*unit)) { + valid = 1; + no_unit = 1; + goto parse; + } + for (units = config->units; units->name; units++) { + /* value character and unit character are both valid */ + if (!strcasecmp(unit, units->name)) { + valid = 1; + no_unit = 0; + break; + } + } + } + +parse: + /* if invalid, use default env */ + if (valid) { + config->env = env; + if (!no_unit) + config->unit = unit; + } else { + log(TERM, LOG_INFO, "Improper %s, set to default %s.\n", + config->name, config->env); + } + + /* if env value string is greater than ulong_max, truncate the last
digit */ + sscanf(config->env, "%lu", &value); + for (units = config->units; units->name; units++) { + if (!strcasecmp(config->unit, units->name)) + unit_matched = 1; + if (unit_matched) { + tmp = value; + value *= units->val; + if (tmp != 0 && value / tmp != units->val) + config->overflow = true; + } + } + config->val = value; + /* In order to output value and unit perfectly */ + config->unit = no_unit ? config->unit : ""; +} +/* Render the effective setting into str, a buffer of PARSED_ENV_LEN bytes; snprintf truncates instead of overrunning it when the environment value is very long. */ +static void parse_env_string(struct isolation *config, char *str) +{ + int i; + + if (config->overflow) { + /* when overflow, use basic unit */ + for (i = 0; config->units[i].name; i++) ; + snprintf(str, PARSED_ENV_LEN, "%lu%s", config->val, config->units[i-1].name); + log(TERM, LOG_INFO, "%s is set overflow(%s), truncate it\n", + config->name, config->env); + } else { + snprintf(str, PARSED_ENV_LEN, "%s%s", config->env, config->unit); + } +} +/* Parse the threshold and refresh cycle from the environment and log the effective values. */ +static void page_isolation_init(void) +{ + char threshold_string[PARSED_ENV_LEN]; + char cycle_string[PARSED_ENV_LEN]; + /** + * It's unnecessary to parse threshold configuration when offline + * choice is off. + */ + if (offline == OFFLINE_OFF) + return; + + parse_isolation_env(&threshold); + parse_isolation_env(&cycle); + parse_env_string(&threshold, threshold_string); + parse_env_string(&cycle, cycle_string); + log(TERM, LOG_INFO, "Threshold of memory Corrected Errors is %s / %s\n", + threshold_string, cycle_string); +} +/* Public entry point: the offline policy is set up first because the threshold parsing is skipped when the policy is off. */ +void ras_page_account_init(void) +{ + page_offline_init(); + page_isolation_init(); +} +/* Write addr in hex to the kernel sysfs file selected by type; returns 0 on success and -1 on failure. */ +static int do_page_offline(unsigned long long addr, enum otype type) +{ + FILE *offline_file; + int err; + + offline_file = fopen(kernel_offline[type], "w"); + if (!offline_file) + return -1; + + fprintf(offline_file, "%#llx", addr); + err = ferror(offline_file) ?
-1 : 0; + if (fclose(offline_file)) /* the buffered write is only flushed here, so a kernel rejection shows up as an fclose() failure */ + err = -1; + return err; +} +/* Offline the page described by pr according to the active policy and record the outcome in pr->offlined; pages that were already handled are skipped. */ +static void page_offline(struct page_record *pr) +{ + unsigned long long addr = pr->addr; + int ret; + + /* Offlining page is not required */ + if (offline <= OFFLINE_ACCOUNT) + return; + + /* Ignore offlined pages */ + if (pr->offlined != PAGE_ONLINE) + return; + + /* Time to silence this noisy page */ + if (offline == OFFLINE_SOFT_THEN_HARD) { + ret = do_page_offline(addr, OFFLINE_SOFT); + if (ret < 0) + ret = do_page_offline(addr, OFFLINE_HARD); + } else { + ret = do_page_offline(addr, offline); + } + + pr->offlined = ret < 0 ? PAGE_OFFLINE_FAILED : PAGE_OFFLINE; + + log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", + addr, page_state[pr->offlined]); +} +/* Add count errors seen at time to pr, ageing the counter once a refresh cycle has elapsed, and offline the page when the threshold is reached. */ +static void page_record(struct page_record *pr, unsigned count, time_t time) +{ + unsigned long period = time - pr->start; + unsigned long tolerate; + + if (period >= cycle.val) { + /** + * Since we don't refresh automatically, it is possible that the period + * between two occurrences will be longer than the pre-configured refresh cycle. + * In this case, we tolerate the frequency of the whole period up to + * the pre-configured threshold. + */ + tolerate = (period / (double)cycle.val) * threshold.val; + pr->count -= (tolerate > pr->count) ? pr->count : tolerate; + pr->start = time; + pr->excess = 0; + } + + pr->count += count; + if (pr->count >= threshold.val) { + log(TERM, LOG_INFO, "Corrected Errors at %#llx exceeded threshold\n", pr->addr); + + /** + * Backup ce count of current cycle to enable next round, which actually + * should never happen if we can disable overflow completely in the same + * time unit (but sadly we can't).
+ */ + pr->excess += pr->count; + pr->count = 0; + page_offline(pr); + } +} +/* Return the record for page addr, allocating and inserting a zeroed record into page_records when none exists; returns NULL if the allocation fails. */ +static struct page_record *page_lookup_insert(unsigned long long addr) +{ + struct rb_node **entry = &page_records.rb_node; + struct rb_node *parent = NULL; + struct page_record *pr = NULL, *find = NULL; + /* Walk down the tree by address, remembering the parent and the link a new node would hang from. */ + while (*entry) { + parent = *entry; + pr = rb_entry(parent, struct page_record, entry); + if (addr == pr->addr) { + return pr; + } else if (addr < pr->addr) { + entry = &(*entry)->rb_left; + } else { + entry = &(*entry)->rb_right; + } + } + /* Not found: create a zero-initialised record and link it at the position reached above. */ + find = calloc(1, sizeof(struct page_record)); + if (!find) { + log(TERM, LOG_ERR, "No memory for page records\n"); + return NULL; + } + + find->addr = addr; + rb_link_node(&find->entry, parent, entry); + rb_insert_color(&find->entry, &page_records); + + return find; +} +/* Account count corrected errors, observed at time, against the page that contains addr; returns immediately when the offline policy is OFFLINE_OFF. */ +void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) +{ + struct page_record *pr = NULL; + + if (offline == OFFLINE_OFF) + return; + + pr = page_lookup_insert(addr & PAGE_MASK); + if (pr) { + if (!pr->start) + pr->start = time; + page_record(pr, count, time); + } +} diff --git a/ras-page-isolation.h b/ras-page-isolation.h new file mode 100644 index 0000000..3d03cef --- /dev/null +++ b/ras-page-isolation.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+*/ + +#ifndef __RAS_PAGE_ISOLATION_H +#define __RAS_PAGE_ISOLATION_H + +#include <time.h> +#include <stdbool.h> +#include "rbtree.h" +/* Errors are accounted per 4 KiB page: PAGE_MASK clears the low PAGE_SHIFT bits of an address. */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1 << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) +/* Name/value pair used for the unit tables and for the offline policy names. */ +struct config { + char *name; + unsigned long val; +}; +/* Offline policy; only values above OFFLINE_ACCOUNT make the daemon write to the kernel offline files. */ +enum otype { + OFFLINE_OFF, + OFFLINE_ACCOUNT, + OFFLINE_SOFT, + OFFLINE_HARD, + OFFLINE_SOFT_THEN_HARD, +}; +/* Outcome of the offlining attempt for one page. */ +enum pstate { + PAGE_ONLINE, + PAGE_OFFLINE, + PAGE_OFFLINE_FAILED, +}; +/* Per-page accounting record, linked into the red-black tree through entry and keyed by addr. */ +struct page_record { + struct rb_node entry; + unsigned long long addr; + time_t start; + enum pstate offlined; + unsigned long count; + unsigned long excess; +}; +/* One tunable: the environment variable name, its textual value and unit, the unit table, and the parsed value with an overflow flag. */ +struct isolation { + char *name; + char *env; + const struct config *units; + unsigned long val; + bool overflow; + char *unit; +}; + +void ras_page_account_init(void); +void ras_record_page_error(unsigned long long addr, unsigned count, time_t time); + +#endif diff --git a/ras-record.c b/ras-record.c index 318bace..549c494 100644 --- a/ras-record.c +++ b/ras-record.c @@ -713,8 +713,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) log(TERM, LOG_ERR, "cpu %u: Failed to initialize sqlite: error = %d\n", cpu, rc); - free(priv); - return -1; + goto error; } do { @@ -730,66 +729,93 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) log(TERM, LOG_ERR, "cpu %u: Failed to connect to %s: error = %d\n", cpu, SQLITE_RAS_DB, rc); - free(priv); - return -1; + goto error; } priv->db = db; rc = ras_mc_create_table(priv, &mc_event_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab); + if (rc != SQLITE_OK) + goto error; + } #ifdef HAVE_AER rc = ras_mc_create_table(priv, &aer_event_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc = ras_mc_prepare_stmt(priv, &priv->stmt_aer_event, &aer_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif #ifdef HAVE_EXTLOG rc = ras_mc_create_table(priv, &extlog_event_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc
= ras_mc_prepare_stmt(priv, &priv->stmt_extlog_record, &extlog_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif #ifdef HAVE_MCE rc = ras_mc_create_table(priv, &mce_record_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc = ras_mc_prepare_stmt(priv, &priv->stmt_mce_record, &mce_record_tab); + if (rc != SQLITE_OK) + goto error; + } #endif #ifdef HAVE_NON_STANDARD rc = ras_mc_create_table(priv, &non_standard_event_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc = ras_mc_prepare_stmt(priv, &priv->stmt_non_standard_record, &non_standard_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif #ifdef HAVE_ARM rc = ras_mc_create_table(priv, &arm_event_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record, &arm_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif #ifdef HAVE_DEVLINK rc = ras_mc_create_table(priv, &devlink_event_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc = ras_mc_prepare_stmt(priv, &priv->stmt_devlink_event, &devlink_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif #ifdef HAVE_DISKERROR rc = ras_mc_create_table(priv, &diskerror_event_tab); - if (rc == SQLITE_OK) + if (rc == SQLITE_OK) { rc = ras_mc_prepare_stmt(priv, &priv->stmt_diskerror_event, &diskerror_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif - ras->db_priv = priv; + ras->db_priv = priv; return 0; + +error: + free(priv); + return -1; } int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) diff --git a/ras-record.h b/ras-record.h index 3fbdc5e..cc217a9 100644 --- a/ras-record.h +++ b/ras-record.h @@ -27,7 +27,7 @@ extern long user_hz; -struct ras_events *ras; +struct ras_events; struct ras_mc_event { char timestamp[64]; diff --git a/rbtree.c b/rbtree.c new file mode 100644 index 0000000..d9b1bd4 --- /dev/null +++ b/rbtree.c @@ -0,0 +1,384 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + (C) 2002 David Woodhouse + Taken from the Linux 2.6.30 source with 
some minor modificatons. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c +*/ + +#include "rbtree.h" +/* Rotate left around node: its right child takes its place and node becomes the left child of that node. */ +static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *right = node->rb_right; + struct rb_node *parent = rb_parent(node); + + if ((node->rb_right = right->rb_left)) + rb_set_parent(right->rb_left, node); + right->rb_left = node; + + rb_set_parent(right, parent); + + if (parent) + { + if (node == parent->rb_left) + parent->rb_left = right; + else + parent->rb_right = right; + } + else + root->rb_node = right; + rb_set_parent(node, right); +} +/* Rotate right around node: its left child takes its place and node becomes the right child of that node. */ +static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *left = node->rb_left; + struct rb_node *parent = rb_parent(node); + + if ((node->rb_left = left->rb_right)) + rb_set_parent(left->rb_right, node); + left->rb_right = node; + + rb_set_parent(left, parent); + + if (parent) + { + if (node == parent->rb_right) + parent->rb_right = left; + else + parent->rb_left = left; + } + else + root->rb_node = left; + rb_set_parent(node, left); +} +/* Recolour and rotate upwards from node while its parent is red, then force the root black. */ +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *parent, *gparent; + + while ((parent = rb_parent(node)) && rb_is_red(parent)) + { + gparent = rb_parent(parent); + + if (parent == gparent->rb_left) + { + {
register struct rb_node *uncle = gparent->rb_right; + if (uncle && rb_is_red(uncle)) + { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_right == node) + { + struct rb_node *tmp; + __rb_rotate_left(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_right(gparent, root); + } else { + { + struct rb_node *uncle = gparent->rb_left; + if (uncle && rb_is_red(uncle)) + { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_left == node) + { + struct rb_node *tmp; + __rb_rotate_right(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_left(gparent, root); + } + } + /* Whatever the loop did, the root always ends up black. */ + rb_set_black(root->rb_node); +} +/* Rebalance after a black node was unlinked: node is the child that took its place (may be NULL) and parent is the parent of that position. */ +static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, + struct rb_root *root) +{ + struct rb_node *other; + + while ((!node || rb_is_black(node)) && node != root->rb_node) + { + if (parent->rb_left == node) + { + other = parent->rb_right; + if (rb_is_red(other)) + { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_left(parent, root); + other = parent->rb_right; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) + { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } + else + { + if (!other->rb_right || rb_is_black(other->rb_right)) + { + rb_set_black(other->rb_left); + rb_set_red(other); + __rb_rotate_right(other, root); + other = parent->rb_right; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_right); + __rb_rotate_left(parent, root); + node = root->rb_node; + break; + } + } + else + { + other = parent->rb_left; + if (rb_is_red(other)) + { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_right(parent,
root); + other = parent->rb_left; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) + { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } + else + { + if (!other->rb_left || rb_is_black(other->rb_left)) + { + rb_set_black(other->rb_right); + rb_set_red(other); + __rb_rotate_left(other, root); + other = parent->rb_left; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_left); + __rb_rotate_right(parent, root); + node = root->rb_node; + break; + } + } + } + if (node) + rb_set_black(node); +} +/* Unlink node from the tree; when the colour taken out of the tree is black, __rb_erase_color() repairs the balance. */ +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *child, *parent; + int color; + + if (!node->rb_left) + child = node->rb_right; + else if (!node->rb_right) + child = node->rb_left; + else + { + struct rb_node *old = node, *left; + /* Two children: the leftmost node of the right subtree is moved into the place of old. */ + node = node->rb_right; + while ((left = node->rb_left) != NULL) + node = left; + child = node->rb_right; + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + if (parent == old) { + parent->rb_right = child; + parent = node; + } else + parent->rb_left = child; + + node->rb_parent_color = old->rb_parent_color; + node->rb_right = old->rb_right; + node->rb_left = old->rb_left; + + if (rb_parent(old)) + { + if (rb_parent(old)->rb_left == old) + rb_parent(old)->rb_left = node; + else + rb_parent(old)->rb_right = node; + } else + root->rb_node = node; + + rb_set_parent(old->rb_left, node); + if (old->rb_right) + rb_set_parent(old->rb_right, node); + goto color; + } + /* At most one child: splice that child (possibly NULL) into the place of node. */ + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + if (parent) + { + if (parent->rb_left == node) + parent->rb_left = child; + else + parent->rb_right = child; + } + else + root->rb_node = child; + + color: + if (color == RB_BLACK) + __rb_erase_color(child, parent, root); +} + +/* + * This function returns the first node (in sort
order) of the tree. + */ +struct rb_node *rb_first(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} +/* Return the last node (in sort order) of the tree, or NULL when the tree is empty. */ +struct rb_node *rb_last(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} +/* Return the node that follows node in sort order, or NULL when there is none or when node is its own parent. */ +struct rb_node *rb_next(const struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + /* If we have a right-hand child, go down and then left as far + as we can. */ + if (node->rb_right) { + node = node->rb_right; + while (node->rb_left) + node=node->rb_left; + return (struct rb_node *)node; + } + + /* No right-hand children. Everything down and left is + smaller than us, so any 'next' node must be in the general + direction of our parent. Go up the tree; any time the + ancestor is a right-hand child of its parent, keep going + up. First time it's a left-hand child of its parent, said + parent is our 'next' node. */ + while ((parent = rb_parent(node)) && node == parent->rb_right) + node = parent; + + return parent; +} +/* Return the node that precedes node in sort order, or NULL when there is none or when node is its own parent. */ +struct rb_node *rb_prev(const struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + /* If we have a left-hand child, go down and then right as far + as we can. */ + if (node->rb_left) { + node = node->rb_left; + while (node->rb_right) + node=node->rb_right; + return (struct rb_node *)node; + } + + /* No left-hand children.
Go up till we find an ancestor which + is a right-hand child of its parent */ + while ((parent = rb_parent(node)) && node == parent->rb_left) + node = parent; + + return parent; +} +/* Put new in the tree position of victim without rebalancing; new receives the links and colour of victim. */ +void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); + + /* Set the surrounding nodes to point to the replacement */ + if (parent) { + if (victim == parent->rb_left) + parent->rb_left = new; + else + parent->rb_right = new; + } else { + root->rb_node = new; + } + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; +} diff --git a/rbtree.h b/rbtree.h new file mode 100644 index 0000000..a8a0459 --- /dev/null +++ b/rbtree.h @@ -0,0 +1,165 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + Taken from the Linux 2.6.30 source. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/include/linux/rbtree.h + + To use rbtrees you'll have to implement your own insert and search cores. + This avoids the use of callbacks, which would drop performance dramatically. + I know it's not the cleaner way, but in C (not in C++) to get + performances and genericity... + + Some examples of insert and search follow here.
The search is a plain + normal search over an ordered tree. The insert instead must be implemented + in two steps: as first thing the code must insert the element in + order as a red leaf in the tree, then the support library function + rb_insert_color() must be called. Such function will do the + not trivial work to rebalance the rbtree if necessary. + +----------------------------------------------------------------------- +static inline struct page * rb_search_page_cache(struct inode * inode, + unsigned long offset) +{ + struct rb_node * n = inode->i_rb_page_cache.rb_node; + struct page * page; + + while (n) + { + page = rb_entry(n, struct page, rb_page_cache); + + if (offset < page->offset) + n = n->rb_left; + else if (offset > page->offset) + n = n->rb_right; + else + return page; + } + return NULL; +} + +static inline struct page * __rb_insert_page_cache(struct inode * inode, + unsigned long offset, + struct rb_node * node) +{ + struct rb_node ** p = &inode->i_rb_page_cache.rb_node; + struct rb_node * parent = NULL; + struct page * page; + + while (*p) + { + parent = *p; + page = rb_entry(parent, struct page, rb_page_cache); + + if (offset < page->offset) + p = &(*p)->rb_left; + else if (offset > page->offset) + p = &(*p)->rb_right; + else + return page; + } + + rb_link_node(node, parent, p); + + return NULL; +} + +static inline struct page * rb_insert_page_cache(struct inode * inode, + unsigned long offset, + struct rb_node * node) +{ + struct page * ret; + if ((ret = __rb_insert_page_cache(inode, offset, node))) + goto out; + rb_insert_color(node, &inode->i_rb_page_cache); + out: + return ret; +} +----------------------------------------------------------------------- +*/ + +#ifndef _LINUX_RBTREE_H +#define _LINUX_RBTREE_H + +#include <stddef.h> +/* Map a pointer to a member back to the structure that contains it; relies on offsetof from stddef.h. */ +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) +/* Tree node meant to be embedded in a user structure; rb_parent_color packs the parent pointer together with the colour bit. */ +struct rb_node +{ + unsigned long rb_parent_color; +#define
RB_RED 0 +#define RB_BLACK 1 + struct rb_node *rb_right; + struct rb_node *rb_left; +} __attribute__((aligned(sizeof(long)))); + /* The alignment might seem pointless, but allegedly CRIS needs it */ +/* Root of a tree: a pointer to the top node, NULL while the tree is empty. */ +struct rb_root +{ + struct rb_node *rb_node; +}; + +/* Bit 0 of rb_parent_color is the colour (0 red, 1 black); masking with ~3 yields the parent pointer. */ +#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) +#define rb_color(r) ((r)->rb_parent_color & 1) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) rb_color(r) +#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) +#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) +/* Replace the parent pointer while keeping the two low bits, and replace the colour bit while keeping the rest. */ +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; +} +static inline void rb_set_color(struct rb_node *rb, int color) +{ + rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; +} + +#define RB_ROOT (struct rb_root) { NULL, } +#define rb_entry(ptr, type, member) container_of(ptr, type, member) +/* A node whose parent pointer refers to itself is treated as not linked into any tree. */ +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_NODE(node) (rb_parent(node) == node) +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +/* Rebalancing entry points: call rb_insert_color() after linking a node and rb_erase() to remove one. */ +extern void rb_insert_color(struct rb_node *, struct rb_root *); +extern void rb_erase(struct rb_node *, struct rb_root *); + +/* Find logical next and previous nodes in a tree */ +extern struct rb_node *rb_next(const struct rb_node *); +extern struct rb_node *rb_prev(const struct rb_node *); +extern struct rb_node *rb_first(const struct rb_root *); +extern struct rb_node *rb_last(const struct rb_root *); + +/* Fast replacement of a single node without remove/rebalance/add/rebalance */ +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root); +/* Attach node below parent through *rb_link as a red leaf with no children; the caller then runs rb_insert_color(). */ +static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, + struct rb_node ** rb_link) +{ + node->rb_parent_color = (unsigned long )parent; + node->rb_left = node->rb_right = NULL; + + *rb_link = node; +} + +#endif /* _LINUX_RBTREE_H
*/ diff --git a/util/Makefile.in b/util/Makefile.in index 443c538..53e7b82 100644 --- a/util/Makefile.in +++ b/util/Makefile.in @@ -238,6 +238,7 @@ WITH_DISKERROR = @WITH_DISKERROR@ WITH_EXTLOG = @WITH_EXTLOG@ WITH_HISI_NS_DECODE = @WITH_HISI_NS_DECODE@ WITH_MCE = @WITH_MCE@ +WITH_MEMORY_CE_PFA = @WITH_MEMORY_CE_PFA@ WITH_NON_STANDARD = @WITH_NON_STANDARD@ WITH_SQLITE3 = @WITH_SQLITE3@ abs_builddir = @abs_builddir@