diff --git a/README.md b/README.md
index ce98f9e17..ed5a177d7 100644
--- a/README.md
+++ b/README.md
@@ -347,6 +347,44 @@ To configure this script to run using cron, do:
 * To display all scheduled cron jobs for the current user, you can use `crontab -l`
 * You can halt the cron job by opening up the crontab file and commenting out the job with `#` or delete the crontab expression from the crontab file
+# Perform History Cleanup
+
+You can perform a manual history cleanup operation that prunes all repetitive entries from the fhir_endpoints_info_history table, as determined by the comparisons used in the history pruning algorithm. It also prunes the corresponding entries from the validations and validation_results tables. The cleanup is a two-step process.
+
+Step 1: Collect the identifiers of repetitive entries
+
+Change directory to /scripts inside lantern-back-end and run:
+
+ ```bash
+ ./duplicate_info_history_check.sh
+ ```
+
+This captures the identifiers of repetitive entries in the fhir_endpoints_info_history table and stores them in the duplicateInfoHistoryIds.csv file inside the /home directory of the lantern-back-end_endpoint_manager_1 container.
+
+To retrieve the CSV file, change directory to /lantern-back-end and run:
+
+ ```bash
+ docker cp lantern-back-end_endpoint_manager_1:/home/duplicateInfoHistoryIds.csv .
+ ```
+
+Step 2: Perform the history cleanup
+
+Change directory to /scripts inside lantern-back-end and run:
+
+ ```bash
+ ./history_cleanup.sh
+ ```
+
+This deletes data from the fhir_endpoints_info_history table using the captured identifiers of repetitive entries. Next, run:
+
+ ```bash
+ ./validations_cleanup.sh
+ ```
+
+This deletes data from the validations and validation_results tables using the captured identifiers of repetitive entries.
+
+Note: Ensure that the duplicateInfoHistoryIds.csv file is present in /lantern-back-end before executing the above scripts.
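+
+For reference, each line of duplicateInfoHistoryIds.csv identifies one prunable history entry by URL, entered_at timestamp, requested FHIR version, and validation result id. For example (illustrative values only, taken to match the columns the capture step writes):
+
+ ```
+ https://example.org/fhir/r4,2023-01-05 10:15:00.000000,4.0.1,12345
+ ```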
+
 # Running Lantern Services Individually
 
 ## Internal Services
diff --git a/endpointmanager/cmd/historycleanup/main.go b/endpointmanager/cmd/historycleanup/main.go
new file mode 100644
index 000000000..807be20aa
--- /dev/null
+++ b/endpointmanager/cmd/historycleanup/main.go
@@ -0,0 +1,25 @@
+package main
+
+import (
+	"context"
+
+	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/config"
+	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/endpointmanager/postgresql"
+	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers"
+	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/historycleanup"
+	log "github.com/sirupsen/logrus"
+	"github.com/spf13/viper"
+)
+
+func main() {
+	err := config.SetupConfig()
+	helpers.FailOnError("", err)
+
+	store, err := postgresql.NewStore(viper.GetString("dbhost"), viper.GetInt("dbport"), viper.GetString("dbuser"), viper.GetString("dbpassword"), viper.GetString("dbname"), viper.GetString("dbsslmode"))
+	helpers.FailOnError("", err)
+	log.Info("Successfully connected to DB!")
+
+	ctx := context.Background()
+
+	historycleanup.GetInfoHistoryDuplicateData(ctx, store, true)
+}
diff --git a/endpointmanager/pkg/endpointmanager/postgresql/historycleanupstore.go b/endpointmanager/pkg/endpointmanager/postgresql/historycleanupstore.go
new file mode 100644
index 000000000..62954ff59
--- /dev/null
+++ b/endpointmanager/pkg/endpointmanager/postgresql/historycleanupstore.go
@@ -0,0 +1,57 @@
+package postgresql
+
+import (
+	"context"
+	"database/sql"
+
+	log "github.com/sirupsen/logrus"
+)
+
+var duplicateInfoHistoryStatement *sql.Stmt
+var distinctURLStatement *sql.Stmt
+
+// GetDistinctURLsFromHistory gets a list of ordered distinct URLs from the history table
+func (s *Store) GetDistinctURLsFromHistory(ctx context.Context) (*sql.Rows, error) {
+
+	log.Info("Inside GetDistinctURLsFromHistory")
+
+	var err error
+
+	distinctURLStatement, err = s.DB.Prepare(`
+		SELECT DISTINCT(url) FROM fhir_endpoints_info_history
+		WHERE (operation='U' OR operation='I')
+		ORDER BY url;`)
+
+	if err != nil {
+		return nil, err
+	}
+
+	var rows *sql.Rows
+
+	rows, err = distinctURLStatement.QueryContext(ctx)
+
+	return rows, err
+}
+
+// PruningGetInfoHistoryUsingURL gets info history entries matching the given URL for pruning
+// (queryInterval is currently unused)
+func (s *Store) PruningGetInfoHistoryUsingURL(ctx context.Context, queryInterval bool, url string) (*sql.Rows, error) {
+
+	log.Info("Inside PruningGetInfoHistoryUsingURL")
+
+	var err error
+
+	duplicateInfoHistoryStatement, err = s.DB.Prepare(`
+		SELECT operation, url, capability_statement, entered_at, tls_version, mime_types, smart_response, validation_result_id, requested_fhir_version FROM fhir_endpoints_info_history
+		WHERE (operation='U' OR operation='I') AND url = $1
+		ORDER BY entered_at ASC;`)
+
+	if err != nil {
+		return nil, err
+	}
+
+	var rows *sql.Rows
+
+	rows, err = duplicateInfoHistoryStatement.QueryContext(ctx, url)
+
+	return rows, err
+}
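These two queries are designed to be consumed together: one pass over the distinct URLs, then one entered_at-ordered pass over each URL's history rows. A minimal sketch of that call pattern (error handling trimmed; the real traversal lives in pkg/historycleanup below, and this helper is not part of the change):

```go
package historycleanup

import (
	"context"

	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/endpointmanager/postgresql"
)

// walkHistory iterates the distinct URLs, then fetches each URL's ordered
// history rows for comparison. Sketch only, not part of this change.
func walkHistory(ctx context.Context, store *postgresql.Store) error {
	urls, err := store.GetDistinctURLsFromHistory(ctx)
	if err != nil {
		return err
	}
	defer urls.Close()

	for urls.Next() {
		var url string
		if err := urls.Scan(&url); err != nil {
			return err
		}
		rows, err := store.PruningGetInfoHistoryUsingURL(ctx, true, url)
		if err != nil {
			return err
		}
		// ... compare consecutive rows here (see historycleanup.go) ...
		rows.Close() // close each per-URL result set promptly
	}
	return urls.Err()
}
```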
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers" + "github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/smartparser" + log "github.com/sirupsen/logrus" +) + +// GetInfoHistoryDuplicateData checks info table and stores the identifiers of any repetitive entries in CSV files +func GetInfoHistoryDuplicateData(ctx context.Context, store *postgresql.Store, queryInterval bool) { + + historyRowCount := 1 + historyDuplicateRowCount := 1 + + var rows *sql.Rows + var distinctURLrows *sql.Rows + var err error + var existingDistinctURLs []string + var URLCaptured bool + + // Get distinct URLs from the history table + distinctURLrows, err = store.GetDistinctURLsFromHistory(ctx) + helpers.FailOnError("", err) + + // Open (or create if not present) csv files (in APPEND mode) to store list of distinct URLs and pruning data identifiers + // NOTE: This will create CSV files in the /home directory of the lantern-back-end-endpoint_manager-1 container + distinctURLfile, err := os.OpenFile("/home/distinctURLsFromHistory.csv", os.O_APPEND|os.O_CREATE|os.O_RDWR, 0644) + if err != nil { + log.Fatalf("Failed to create file: %s", err) + } + defer distinctURLfile.Close() + + // Read the distinctURLsFromHistory file to check whether URLs are already added to it + csvReader := csv.NewReader(distinctURLfile) + csvData, err := csvReader.ReadAll() + if err != nil { + log.Fatalf("Error reading CSV data: %v\n", err) + } + + // Ignore the URLs already added during the pruning data capture operation + if len(csvData) > 0 { + log.Info("Existing distinctURLsFromHistory file detected. URLs already present in this file will be ignored.") + existingDistinctURLs = flatten2D(csvData) + } + + duplicateInfoHistoryFile, err := os.OpenFile("/home/duplicateInfoHistoryIds.csv", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + log.Fatalf("Failed to create file: %s", err) + } + defer duplicateInfoHistoryFile.Close() + + // Create CSV writers + distinctURLWriter := csv.NewWriter(distinctURLfile) + duplicateInfoHistoryDataWriter := csv.NewWriter(duplicateInfoHistoryFile) + + log.Info("Starting the duplicate info history data capture.") + + for distinctURLrows.Next() { + + url := getDistinctRowInfo(distinctURLrows) + URLCaptured = false + + // Check whether duplicate data is already captured for the given URL + for idx, val := range existingDistinctURLs { + if url == val { + log.Info("Duplicate info history data already captured. Ignoring URL: ", url) + + // Set the flag + URLCaptured = true + + // Remove the URL from the list of existing URLs + existingDistinctURLs = append(existingDistinctURLs[:idx], existingDistinctURLs[idx+1:]...) 
+		rows, err = store.PruningGetInfoHistoryUsingURL(ctx, queryInterval, url)
+		helpers.FailOnError("", err)
+
+		if !rows.Next() {
+			continue // no history rows for this URL; move on to the next one
+		}
+
+		var pruningData [][]string
+		_, fhirURL1, _, capStat1, tlsVersion1, mimeTypes1, smartResponse1, _, requestedFhirVersion1 := getRowInfo(rows)
+
+		for rows.Next() {
+			log.Info("Info History Row Count: ", historyRowCount)
+			historyRowCount++
+			operation2, fhirURL2, entryDate2, capStat2, tlsVersion2, mimeTypes2, smartResponse2, valResID2, requestedFhirVersion2 := getRowInfo(rows)
+
+			equalFhirEntries := fhirURL1 == fhirURL2
+
+			if equalFhirEntries {
+				equalFhirEntries = (requestedFhirVersion1 == requestedFhirVersion2)
+
+				if equalFhirEntries {
+					equalFhirEntries = (tlsVersion1 == tlsVersion2)
+
+					if equalFhirEntries {
+						equalFhirEntries = helpers.StringArraysEqual(mimeTypes1, mimeTypes2)
+
+						if equalFhirEntries {
+							// If the capability statement is not null, check whether the current entry's capability statement equals that of the older entry from the history table; otherwise check that both are null
+							if capStat1 != nil {
+								equalFhirEntries = capStat1.EqualIgnore(capStat2)
+							} else {
+								equalFhirEntries = (capStat2 == nil)
+							}
+
+							if equalFhirEntries {
+								// If the SMART response is not null, check whether the current entry's SMART response equals that of the older entry from the history table; otherwise check that both are null
+								if smartResponse1 != nil {
+									ignoredFields := []string{}
+									equalFhirEntries = smartResponse1.EqualIgnore(smartResponse2, ignoredFields)
+								} else {
+									equalFhirEntries = (smartResponse2 == nil)
+								}
+							}
+						}
+					}
+				}
+			}
+
+			if equalFhirEntries && operation2 == "U" {
+				log.Info("Duplicate Info History Row Count: ", historyDuplicateRowCount)
+				historyDuplicateRowCount++
+				log.Infof("Duplicate Data Captured :: URL: %s, Entered At: %s, Requested FHIR Version: %s, Validation Result ID: %s", fhirURL2, entryDate2, requestedFhirVersion2, strconv.Itoa(valResID2))
+				pruningData = append(pruningData, []string{fhirURL2, entryDate2, requestedFhirVersion2, strconv.Itoa(valResID2)})
+			} else {
+				// Mismatch: this row becomes the new comparison baseline
+				fhirURL1 = fhirURL2
+				capStat1 = capStat2
+				tlsVersion1 = tlsVersion2
+				mimeTypes1 = mimeTypes2
+				smartResponse1 = smartResponse2
+				requestedFhirVersion1 = requestedFhirVersion2
+				continue
+			}
+		}
+
+		err = duplicateInfoHistoryDataWriter.WriteAll(pruningData)
+		if err != nil {
+			log.Fatal("Error writing to duplicateInfoHistoryDataWriter:", err)
+		}
+
+		duplicateInfoHistoryDataWriter.Flush()
+		if err := duplicateInfoHistoryDataWriter.Error(); err != nil {
+			log.Fatal("Error flushing duplicateInfoHistoryDataWriter:", err)
+		}
+
+		err = distinctURLWriter.Write([]string{url})
+		if err != nil {
+			log.Fatal("Error writing to distinctURLWriter:", err)
+		}
+
+		distinctURLWriter.Flush()
+		if err := distinctURLWriter.Error(); err != nil {
+			log.Fatal("Error flushing distinctURLWriter:", err)
+		}
+	}
+}
+
+// flatten2D converts a 2D slice to a 1D slice
+func flatten2D(data2D [][]string) []string {
+	var data1D []string
+	for _, row := range data2D {
+		data1D = append(data1D, row...)
+	}
+	return data1D
+}
+
+func getDistinctRowInfo(rows *sql.Rows) string {
+	var url string
+
+	err := rows.Scan(&url)
+	helpers.FailOnError("", err)
+
+	return url
+}
+
+// getRowInfo scans one history row and parses its capability statement and SMART response
+func getRowInfo(rows *sql.Rows) (string, string, string, capabilityparser.CapabilityStatement, string, []string, smartparser.SMARTResponse, int, string) {
+	var capInt map[string]interface{}
+	var fhirURL string
+	var operation string
+	var capStatJSON []byte
+	var entryDate string
+	var tlsVersion string
+	var mimeTypes []string
+	var smartResponseJSON []byte
+	var smartResponseInt map[string]interface{}
+	var valResIDNullable sql.NullInt64
+	var valResID int
+	var requestedFhirVersion string
+
+	err := rows.Scan(&operation, &fhirURL, &capStatJSON, &entryDate, &tlsVersion, pq.Array(&mimeTypes), &smartResponseJSON, &valResIDNullable, &requestedFhirVersion)
+	helpers.FailOnError("", err)
+
+	if !valResIDNullable.Valid {
+		valResID = 0
+	} else {
+		valResID = int(valResIDNullable.Int64)
+	}
+
+	err = json.Unmarshal(capStatJSON, &capInt)
+	helpers.FailOnError("", err)
+	capStat, err := capabilityparser.NewCapabilityStatementFromInterface(capInt)
+	helpers.FailOnError("", err)
+
+	err = json.Unmarshal(smartResponseJSON, &smartResponseInt)
+	helpers.FailOnError("", err)
+	smartResponse := smartparser.NewSMARTRespFromInterface(smartResponseInt)
+
+	return operation, fhirURL, entryDate, capStat, tlsVersion, mimeTypes, smartResponse, valResID, requestedFhirVersion
+}
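The nested comparison in GetInfoHistoryDuplicateData short-circuits: each field is compared only after all cheaper fields have already matched, leaving the parsed capability statement and SMART response checks for last. Condensed into a single predicate, the same logic reads as follows (a hypothetical helper shown only to make the nesting easier to follow, not part of this change):

```go
package historycleanup

import (
	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/capabilityparser"
	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers"
	"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/smartparser"
)

// rowsEqual reports whether two history rows carry the same endpoint data,
// mirroring the nested comparison above: cheap string fields first, then
// mime types, then the parsed documents (equal when both are nil, or both
// are non-nil and EqualIgnore holds).
func rowsEqual(
	url1, url2, rfv1, rfv2, tls1, tls2 string,
	mime1, mime2 []string,
	cap1, cap2 capabilityparser.CapabilityStatement,
	smart1, smart2 smartparser.SMARTResponse,
) bool {
	if url1 != url2 || rfv1 != rfv2 || tls1 != tls2 {
		return false
	}
	if !helpers.StringArraysEqual(mime1, mime2) {
		return false
	}
	if cap1 != nil {
		if !cap1.EqualIgnore(cap2) {
			return false
		}
	} else if cap2 != nil {
		return false
	}
	if smart1 != nil {
		return smart1.EqualIgnore(smart2, []string{})
	}
	return smart2 == nil
}
```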
diff --git a/scripts/duplicate_info_history_check.sh b/scripts/duplicate_info_history_check.sh
new file mode 100644
index 000000000..0aaf61c68
--- /dev/null
+++ b/scripts/duplicate_info_history_check.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+docker exec --workdir /go/src/app/cmd/historycleanup lantern-back-end_endpoint_manager_1 go run main.go || echo "Failed to run duplicate info history check script"
\ No newline at end of file
diff --git a/scripts/history_cleanup.sh b/scripts/history_cleanup.sh
new file mode 100644
index 000000000..3b53f42a8
--- /dev/null
+++ b/scripts/history_cleanup.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+csv_file="../duplicateInfoHistoryIds.csv"
+DB_NAME=lantern
+DB_USER=lantern
+
+# Check if the file exists
+if [ ! -f "$csv_file" ]; then
+    echo "File $csv_file not found!"
+    exit 1
+fi
+
+while IFS=',' read -r col1 col2 col3 col4; do
+    DATE=$(date)
+    echo "($DATE) Deleting entries for data: $col1, $col2, $col3, $col4"
+
+    # Delete the entry from the info history table
+    QUERY="DELETE FROM fhir_endpoints_info_history WHERE url='$col1' AND operation='U' AND requested_fhir_version='$col3' AND entered_at='$col2';"
+    (docker exec -t lantern-back-end_postgres_1 psql -t -U${DB_USER} -d ${DB_NAME} -c "${QUERY}") || echo "Error deleting entry from the info history table"
+
+done < "$csv_file"
+
+echo "Duplicate info history data cleanup complete."
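Note that both cleanup scripts resolve the CSV as ../duplicateInfoHistoryIds.csv relative to /scripts, i.e. in the lantern-back-end root, matching the docker cp step in the README instructions above. Run history_cleanup.sh before validations_cleanup.sh: the validations script below only counts remaining operation='I' references, so it assumes the duplicate 'U' rows have already been removed.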
diff --git a/scripts/validations_cleanup.sh b/scripts/validations_cleanup.sh
new file mode 100644
index 000000000..56348fee7
--- /dev/null
+++ b/scripts/validations_cleanup.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+
+csv_file="../duplicateInfoHistoryIds.csv"
+DB_NAME=lantern
+DB_USER=lantern
+
+# Check if the file exists
+if [ ! -f "$csv_file" ]; then
+    echo "File $csv_file not found!"
+    exit 1
+fi
+
+# Initialize a variable that will hold the validation_result_id from the previous entry.
+VAL_RES_ID=-1
+
+while IFS=',' read -r col1 col2 col3 col4; do
+
+    # If the validation_result_id is not 0 and not already processed, then perform the deletion
+    if [ "${col4}" -ne "0" ] && [ "${VAL_RES_ID}" -ne "${col4}" ]; then
+
+        VAL_RES_ID=$col4
+
+        # Check whether there are entries in the history table having the given validation_result_id and operation = 'I'
+        QUERY="SELECT COUNT(*) FROM fhir_endpoints_info_history WHERE operation='I' AND validation_result_id='$col4';"
+        COUNT=$(docker exec -t lantern-back-end_postgres_1 psql -t -U${DB_USER} -d ${DB_NAME} -c "${QUERY}") || echo "Error counting entries from the history table"
+
+        # Delete the corresponding entries from the validations and validation_results tables ONLY IF the count is zero.
+        NUMBER=$(echo ${COUNT} | tr -cd '[:digit:]')
+        if [ "${NUMBER}" -eq "0" ]; then
+            echo "($(date)) Deleting entries from the validations table for validation_result_id: $col4"
+
+            # Delete the corresponding entry from the validations table
+            QUERY="DELETE FROM validations WHERE validation_result_id='$col4';"
+            (docker exec -t lantern-back-end_postgres_1 psql -t -U${DB_USER} -d ${DB_NAME} -c "${QUERY}") || echo "Error deleting entry from the validations table"
+
+            # Check whether there are entries in the info table having the given validation_result_id
+            QUERY="SELECT COUNT(*) FROM fhir_endpoints_info WHERE validation_result_id='$col4';"
+            COUNT=$(docker exec -t lantern-back-end_postgres_1 psql -t -U${DB_USER} -d ${DB_NAME} -c "${QUERY}") || echo "Error counting entries from the info table"
+
+            # Delete the corresponding entry from the validation_results table ONLY IF the count is zero.
+            NUMBER=$(echo ${COUNT} | tr -cd '[:digit:]')
+            if [ "${NUMBER}" -eq "0" ]; then
+                echo "($(date)) Deleting entries from the validation_results table for id: $col4"
+
+                QUERY="DELETE FROM validation_results WHERE id='$col4';"
+                (docker exec -t lantern-back-end_postgres_1 psql -t -U${DB_USER} -d ${DB_NAME} -c "${QUERY}") || echo "Error deleting entry from the validation_results table"
+            fi
+        fi
+    fi
+done < "$csv_file"
+
+echo "Validation data cleanup complete."
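validations_cleanup.sh only deletes a validation result once nothing references it anymore. For readers more comfortable in Go, the same two-stage guard could be sketched against database/sql roughly as follows (a hypothetical helper, not part of this change):

```go
package historycleanup

import (
	"context"
	"database/sql"
)

// deleteValidationIfUnreferenced mirrors the script's guard: the validations
// rows go first, and the validation_results row is removed only when no
// remaining history ('I') or current info row still points at it.
func deleteValidationIfUnreferenced(ctx context.Context, db *sql.DB, valResID int) error {
	var n int
	// 'I' rows are never pruned, so they keep the validation result alive.
	err := db.QueryRowContext(ctx,
		`SELECT COUNT(*) FROM fhir_endpoints_info_history
		 WHERE operation = 'I' AND validation_result_id = $1`, valResID).Scan(&n)
	if err != nil || n > 0 {
		return err
	}
	if _, err := db.ExecContext(ctx,
		`DELETE FROM validations WHERE validation_result_id = $1`, valResID); err != nil {
		return err
	}
	// The current info table may still reference the result.
	err = db.QueryRowContext(ctx,
		`SELECT COUNT(*) FROM fhir_endpoints_info
		 WHERE validation_result_id = $1`, valResID).Scan(&n)
	if err != nil || n > 0 {
		return err
	}
	_, err = db.ExecContext(ctx,
		`DELETE FROM validation_results WHERE id = $1`, valResID)
	return err
}
```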