LANTERN-715: History Cleanup #379

Merged · 5 commits · Aug 8, 2024
38 changes: 38 additions & 0 deletions README.md
@@ -347,6 +347,44 @@ To configure this script to run using cron, do:
* To display all scheduled cron jobs for the current user, you can use `crontab -l`
* You can halt the cron job by opening up the crontab file and commenting out the job with `#`, or by deleting the crontab expression from the crontab file

# Perform History Cleanup

You can perform a manual history cleanup operation that prunes all repetitive entries from the fhir_endpoints_info_history table, as identified by the comparisons in the history pruning algorithm, along with the corresponding entries in the validations and validation_results tables. The cleanup is a two-step process.

Step 1: Collect the identifiers of repetitive entries

Change directory to the scripts directory inside lantern-back-end and run:

```bash
./duplicate_info_history_check.sh
```

This will start capturing the identifiers of repetitive entries in the fhir_endpoints_info_history table and store them in the duplicateInfoHistoryIds.csv file inside the /home directory of the lantern-back-end_endpoint_manager_1 container.

To retrieve the CSV file, change directory to lantern-back-end and run:

```bash
docker cp lantern-back-end_endpoint_manager_1:/home/duplicateInfoHistoryIds.csv .
```
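
Each row of duplicateInfoHistoryIds.csv identifies one repetitive history entry by its URL, entered_at timestamp, requested FHIR version, and validation result ID (the four columns written by the capture code). The values below are illustrative only:

```
https://fhir.example.com/r4/metadata,2024-05-01 04:12:37.123456,4.0.1,812345
https://fhir.example.com/r4/metadata,2024-05-02 04:10:02.654321,4.0.1,812391
```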

Step 2: Perform the history cleanup

Change directory to the scripts directory inside lantern-back-end and run:

```bash
./history_cleanup.sh
```

This will start deleting data from the fhir_endpoints_info_history table using the captured identifiers of repetitive entries. Next, run:

```bash
./validations_cleanup.sh
```

This will start deleting data from the validations and validation_results tables using the captured identifiers of repetitive entries.

Note: Ensure that the duplicateInfoHistoryIds.csv file is present in the lantern-back-end directory before executing the above scripts.
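
To sanity-check the effect of a cleanup run, you can compare the row count of the history table before and after; the container, database, and user names below are taken from scripts/history_cleanup.sh in this change:

```bash
docker exec -t lantern-back-end_postgres_1 psql -U lantern -d lantern -c "SELECT COUNT(*) FROM fhir_endpoints_info_history;"
```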

# Running Lantern Services Individually

## Internal Services
25 changes: 25 additions & 0 deletions endpointmanager/cmd/historycleanup/main.go
@@ -0,0 +1,25 @@
package main

import (
"context"

"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/config"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/endpointmanager/postgresql"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/historycleanup"
log "github.com/sirupsen/logrus"
"github.com/spf13/viper"
)

func main() {
err := config.SetupConfig()
helpers.FailOnError("", err)

store, err := postgresql.NewStore(viper.GetString("dbhost"), viper.GetInt("dbport"), viper.GetString("dbuser"), viper.GetString("dbpassword"), viper.GetString("dbname"), viper.GetString("dbsslmode"))
helpers.FailOnError("", err)
log.Info("Successfully connected to DB!")

ctx := context.Background()

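// Capture the identifiers of repetitive history entries; results are written to
// CSV files under /home inside the endpoint manager container.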
historycleanup.GetInfoHistoryDuplicateData(ctx, store, true)
}
@@ -0,0 +1,57 @@
package postgresql

import (
"context"
"database/sql"

log "github.com/sirupsen/logrus"
)

var duplicateInfoHistoryStatement *sql.Stmt
var distinctURLStatement *sql.Stmt

// GetDistinctURLsFromHistory gets a list of ordered distinct URLs from the history table
func (s *Store) GetDistinctURLsFromHistory(ctx context.Context) (*sql.Rows, error) {

log.Info("Inside GetDistinctURLsFromHistory")

var err error

distinctURLStatement, err = s.DB.Prepare(`
select DISTINCT(url) FROM fhir_endpoints_info_history
WHERE (operation='U' OR operation='I')
ORDER BY url;`)

if err != nil {
return nil, err
}

var rows *sql.Rows

rows, err = distinctURLStatement.QueryContext(ctx)

return rows, err
}

// PruningGetInfoHistoryUsingURL gets info history entries matching the given URL for pruning
func (s *Store) PruningGetInfoHistoryUsingURL(ctx context.Context, queryInterval bool, url string) (*sql.Rows, error) {

log.Info("Inside PruningGetInfoHistoryUsingURL")

var err error

duplicateInfoHistoryStatement, err = s.DB.Prepare(`
SELECT operation, url, capability_statement, entered_at, tls_version, mime_types, smart_response, validation_result_id, requested_fhir_version FROM fhir_endpoints_info_history
WHERE (operation='U' OR operation='I') AND url = $1
ORDER BY entered_at ASC;`)

if err != nil {
return nil, err
}

var rows *sql.Rows

rows, err = duplicateInfoHistoryStatement.QueryContext(ctx, url)

return rows, err
}
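
To get a rough sense of how many distinct URLs the capture step will iterate over, the filter used by GetDistinctURLsFromHistory can be run manually against the database; the container and credential names below are taken from scripts/history_cleanup.sh in this change:

```bash
docker exec -t lantern-back-end_postgres_1 psql -U lantern -d lantern -c "SELECT COUNT(DISTINCT url) FROM fhir_endpoints_info_history WHERE operation='U' OR operation='I';"
```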
229 changes: 229 additions & 0 deletions endpointmanager/pkg/historycleanup/historycleanup.go
@@ -0,0 +1,229 @@
package historycleanup

import (
"context"
"database/sql"
"encoding/csv"
"encoding/json"
"os"
"strconv"

"github.com/lib/pq"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/capabilityparser"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/endpointmanager/postgresql"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/smartparser"
log "github.com/sirupsen/logrus"
)

// GetInfoHistoryDuplicateData checks the info history table and stores the identifiers of any repetitive entries in CSV files
func GetInfoHistoryDuplicateData(ctx context.Context, store *postgresql.Store, queryInterval bool) {

historyRowCount := 1
historyDuplicateRowCount := 1

var rows *sql.Rows
var distinctURLrows *sql.Rows
var err error
var existingDistinctURLs []string
var URLCaptured bool

// Get distinct URLs from the history table
distinctURLrows, err = store.GetDistinctURLsFromHistory(ctx)
helpers.FailOnError("", err)

// Open (or create if not present) csv files (in APPEND mode) to store list of distinct URLs and pruning data identifiers
// NOTE: This will create CSV files in the /home directory of the lantern-back-end_endpoint_manager_1 container
distinctURLfile, err := os.OpenFile("/home/distinctURLsFromHistory.csv", os.O_APPEND|os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
log.Fatalf("Failed to create file: %s", err)
}
defer distinctURLfile.Close()

// Read the distinctURLsFromHistory file to check whether URLs are already added to it
csvReader := csv.NewReader(distinctURLfile)
csvData, err := csvReader.ReadAll()
if err != nil {
log.Fatalf("Error reading CSV data: %v\n", err)
}

// Ignore the URLs already added during the pruning data capture operation
if len(csvData) > 0 {
log.Info("Existing distinctURLsFromHistory file detected. URLs already present in this file will be ignored.")
existingDistinctURLs = flatten2D(csvData)
}

duplicateInfoHistoryFile, err := os.OpenFile("/home/duplicateInfoHistoryIds.csv", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatalf("Failed to create file: %s", err)
}
defer duplicateInfoHistoryFile.Close()

// Create CSV writers
distinctURLWriter := csv.NewWriter(distinctURLfile)
duplicateInfoHistoryDataWriter := csv.NewWriter(duplicateInfoHistoryFile)

log.Info("Starting the duplicate info history data capture.")

for distinctURLrows.Next() {

url := getDistinctRowInfo(distinctURLrows)
URLCaptured = false

// Check whether duplicate data is already captured for the given URL
for idx, val := range existingDistinctURLs {
if url == val {
log.Info("Duplicate info history data already captured. Ignoring URL: ", url)

// Set the flag
URLCaptured = true

// Remove the URL from the list of existing URLs
existingDistinctURLs = append(existingDistinctURLs[:idx], existingDistinctURLs[idx+1:]...)
break
}
}

// Skip the current iteration if duplicate data is already captured
if URLCaptured {
continue
}

rows, err = store.PruningGetInfoHistoryUsingURL(ctx, queryInterval, url)
helpers.FailOnError("", err)

if !rows.Next() {
return
}

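// The first history row returned for this URL becomes the comparison baseline;
// each subsequent row is compared against it field by field.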
var pruningData [][]string
_, fhirURL1, _, capStat1, tlsVersion1, mimeTypes1, smartResponse1, _, requestedFhirVersion1 := getRowInfo(rows)

for rows.Next() {
log.Info("Info History Row Count: ", historyRowCount)
historyRowCount++
operation2, fhirURL2, entryDate2, capStat2, tlsVersion2, mimeTypes2, smartResponse2, valResID2, requestedFhirVersion2 := getRowInfo(rows)

equalFhirEntries := fhirURL1 == fhirURL2

if equalFhirEntries {
equalFhirEntries = (requestedFhirVersion1 == requestedFhirVersion2)

if equalFhirEntries {
equalFhirEntries = (tlsVersion1 == tlsVersion2)

if equalFhirEntries {
equalFhirEntries = helpers.StringArraysEqual(mimeTypes1, mimeTypes2)

if equalFhirEntries {
// If capstat is not null check if current entry that was passed in has capstat equal to capstat of old entry being checked from history table, otherwise check they are both null
if capStat1 != nil {
equalFhirEntries = capStat1.EqualIgnore(capStat2)
} else {
equalFhirEntries = (capStat2 == nil)
}

if equalFhirEntries {
// If smartresponse is not null check if current entry that was passed in has smartresponse equal to smartresponse of old entry being checked from history table, otherwise check they are both null
if smartResponse1 != nil {
ignoredFields := []string{}
equalFhirEntries = smartResponse1.EqualIgnore(smartResponse2, ignoredFields)
} else {
equalFhirEntries = (smartResponse2 == nil)
}
}
}
}
}
}

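// Only an unchanged UPDATE ('U') row is recorded for pruning; a row that differs
// in any compared field becomes the new baseline for the comparisons that follow.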
if equalFhirEntries && operation2 == "U" {
log.Info("Duplicate Info History Row Count: ", historyDuplicateRowCount)
historyDuplicateRowCount++
log.Infof("Duplicate Data Captured :: URL: %s, Entered At: %s, Requested FHIR Version: %s, Validation Result ID: %s", fhirURL2, entryDate2, requestedFhirVersion2, strconv.Itoa(valResID2))
pruningData = append(pruningData, []string{fhirURL2, entryDate2, requestedFhirVersion2, strconv.Itoa(valResID2)})
} else {
fhirURL1 = fhirURL2
capStat1 = capStat2
tlsVersion1 = tlsVersion2
mimeTypes1 = mimeTypes2
smartResponse1 = smartResponse2
requestedFhirVersion1 = requestedFhirVersion2
continue
}
}

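// Write and flush the captured rows, then record the URL itself, so that a
// re-run can skip URLs whose duplicate data has already been captured.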
err = duplicateInfoHistoryDataWriter.WriteAll(pruningData)
if err != nil {
log.Fatal("Error writing to duplicateInfoHistoryDataWriter:", err)
}

duplicateInfoHistoryDataWriter.Flush()
if err := duplicateInfoHistoryDataWriter.Error(); err != nil {
log.Fatal("Error flushing duplicateInfoHistoryDataWriter:", err)
}

err = distinctURLWriter.Write([]string{url})
if err != nil {
log.Fatal("Error writing to distinctURLWriter:", err)
}

distinctURLWriter.Flush()
if err := distinctURLWriter.Error(); err != nil {
log.Fatal("Error flushing distinctURLWriter:", err)
}
}
}

// flatten2D converts a 2D slice to a 1D slice
func flatten2D(data2D [][]string) []string {
var data1D []string
for _, row := range data2D {
data1D = append(data1D, row...)
}
return data1D
}

func getDistinctRowInfo(rows *sql.Rows) string {
var url string

err := rows.Scan(&url)
helpers.FailOnError("", err)

return url
}

func getRowInfo(rows *sql.Rows) (string, string, string, capabilityparser.CapabilityStatement, string, []string, smartparser.SMARTResponse, int, string) {
var capInt map[string]interface{}
var fhirURL string
var operation string
var capStatJSON []byte
var entryDate string
var tlsVersion string
var mimeTypes []string
var smartResponseJSON []byte
var smartResponseInt map[string]interface{}
var valResIDNullable sql.NullInt64
var valResID int
var requestedFhirVersion string

err := rows.Scan(&operation, &fhirURL, &capStatJSON, &entryDate, &tlsVersion, pq.Array(&mimeTypes), &smartResponseJSON, &valResIDNullable, &requestedFhirVersion)
helpers.FailOnError("", err)

if !valResIDNullable.Valid {
valResID = 0
} else {
valResID = int(valResIDNullable.Int64)
}

err = json.Unmarshal(capStatJSON, &capInt)
helpers.FailOnError("", err)
capStat, err := capabilityparser.NewCapabilityStatementFromInterface(capInt)
helpers.FailOnError("", err)

err = json.Unmarshal(smartResponseJSON, &smartResponseInt)
helpers.FailOnError("", err)
smartResponse := smartparser.NewSMARTRespFromInterface(smartResponseInt)

return operation, fhirURL, entryDate, capStat, tlsVersion, mimeTypes, smartResponse, valResID, requestedFhirVersion
}
3 changes: 3 additions & 0 deletions scripts/duplicate_info_history_check.sh
@@ -0,0 +1,3 @@
#!/bin/sh

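# Run the duplicate info history capture inside the endpoint manager container; identifiers are written to /home/duplicateInfoHistoryIds.csv in that container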
docker exec --workdir /go/src/app/cmd/historycleanup lantern-back-end_endpoint_manager_1 go run main.go || echo "Failed to run duplicate info history check script"
23 changes: 23 additions & 0 deletions scripts/history_cleanup.sh
@@ -0,0 +1,23 @@
#!/bin/sh

csv_file="../duplicateInfoHistoryIds.csv"
DB_NAME=lantern
DB_USER=lantern

# Check if the file exists
if [ ! -f "$csv_file" ]; then
echo "File $csv_file not found!"
exit 1
fi

while IFS=',' read -r col1 col2 col3 col4; do
DATE=$(date)
echo "($DATE) Deleting entries for data: $col1, $col2, $col3, $col4"

# Delete entry from the info history table
QUERY=$(echo "DELETE FROM fhir_endpoints_info_history WHERE url='$col1' AND operation='U' AND requested_fhir_version='$col3' AND entered_at = '$col2';")
(docker exec -t lantern-back-end_postgres_1 psql -t -U${DB_USER} -d ${DB_NAME} -c "${QUERY}") || echo "Error deleting entry from the info history table"

done < "$csv_file"

echo "Duplicate info history data cleanup complete."