LANTERN-715: History Cleanup #379

Merged · 5 commits · Aug 8, 2024
38 changes: 38 additions & 0 deletions README.md
@@ -347,6 +347,44 @@ To configure this script to run using cron, do:
* To display all scheduled cron jobs for the current user, you can use `crontab -l`
* You can halt the cron job by opening up the crontab file and commenting out the job with `#`, or by deleting the crontab expression from the crontab file

# Perform History Cleanup

You can perform a manual history cleanup operation that prunes all repetitive entries from the fhir_endpoints_info_history table, as identified by the comparisons in the history pruning algorithm, along with the corresponding entries in the validations and validation_results tables. The cleanup is a two-step process.

Step 1: Collect the identifiers of repetitive entries

Change directory to the scripts directory inside lantern-back-end and run:

```bash
./duplicate_info_history_check.sh
```

This will start capturing the identifiers of repetitive entries in the fhir_endpoints_info_history table and store them in the duplicateInfoHistoryIds.csv file inside the /home directory of the lantern-back-end_endpoint_manager_1 container.

To retrieve the CSV file, change directory to lantern-back-end and run:

```bash
docker cp lantern-back-end_endpoint_manager_1:/home/duplicateInfoHistoryIds.csv .
```
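
Each row of duplicateInfoHistoryIds.csv identifies one repetitive history entry by its URL, entered_at timestamp, requested FHIR version, and validation result ID (the four columns written by the capture code). The values below are illustrative only:

```
https://fhir.example.com/r4/metadata,2024-05-01 04:12:37.123456,4.0.1,812345
https://fhir.example.com/r4/metadata,2024-05-02 04:10:02.654321,4.0.1,812391
```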

Step 2: Perform the history cleanup

Change directory to the scripts directory inside lantern-back-end and run:

```bash
./history_cleanup.sh
```

This will start deleting data from the fhir_endpoints_info_history table using the captured identifiers of repetitive entries. Next, run:

```bash
./validations_cleanup.sh
```

This will start deleting data from the validations and validation_results tables using the captured identifiers of repetitive entries.

Note: Ensure that the duplicateInfoHistoryIds.csv file is present in the lantern-back-end directory before executing the above scripts.
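
To sanity-check the effect of a cleanup run, you can compare the row count of the history table before and after; the container, database, and user names below are taken from scripts/history_cleanup.sh in this change:

```bash
docker exec -t lantern-back-end_postgres_1 psql -U lantern -d lantern -c "SELECT COUNT(*) FROM fhir_endpoints_info_history;"
```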

# Running Lantern Services Individually

## Internal Services
25 changes: 25 additions & 0 deletions endpointmanager/cmd/historycleanup/main.go
@@ -0,0 +1,25 @@
package main

import (
"context"

"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/config"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/endpointmanager/postgresql"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/historycleanup"
log "github.com/sirupsen/logrus"
"github.com/spf13/viper"
)

func main() {
err := config.SetupConfig()
helpers.FailOnError("", err)

store, err := postgresql.NewStore(viper.GetString("dbhost"), viper.GetInt("dbport"), viper.GetString("dbuser"), viper.GetString("dbpassword"), viper.GetString("dbname"), viper.GetString("dbsslmode"))
helpers.FailOnError("", err)
log.Info("Successfully connected to DB!")

ctx := context.Background()

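// Capture the identifiers of repetitive history entries; results are written to
// CSV files under /home inside the endpoint manager container.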
historycleanup.GetInfoHistoryDuplicateData(ctx, store, true)
}
@@ -0,0 +1,57 @@
package postgresql

import (
"context"
"database/sql"

log "github.com/sirupsen/logrus"
)

var duplicateInfoHistoryStatement *sql.Stmt
var distinctURLStatement *sql.Stmt

// GetDistinctURLsFromHistory gets a list of ordered distinct URLs from the history table
func (s *Store) GetDistinctURLsFromHistory(ctx context.Context) (*sql.Rows, error) {

log.Info("Inside GetDistinctURLsFromHistory")

var err error

distinctURLStatement, err = s.DB.Prepare(`
select DISTINCT(url) FROM fhir_endpoints_info_history
WHERE (operation='U' OR operation='I')
ORDER BY url;`)

if err != nil {
return nil, err
}

var rows *sql.Rows

rows, err = distinctURLStatement.QueryContext(ctx)

return rows, err
}

// PruningGetInfoHistoryUsingURL gets info history entries matching the given URL for pruning
func (s *Store) PruningGetInfoHistoryUsingURL(ctx context.Context, queryInterval bool, url string) (*sql.Rows, error) {

log.Info("Inside PruningGetInfoHistoryUsingURL")

var err error

duplicateInfoHistoryStatement, err = s.DB.Prepare(`
SELECT operation, url, capability_statement, entered_at, tls_version, mime_types, smart_response, validation_result_id, requested_fhir_version FROM fhir_endpoints_info_history
WHERE (operation='U' OR operation='I') AND url = $1
ORDER BY entered_at ASC;`)

if err != nil {
return nil, err
}

var rows *sql.Rows

rows, err = duplicateInfoHistoryStatement.QueryContext(ctx, url)

return rows, err
}
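
To get a rough sense of how many distinct URLs the capture step will iterate over, the filter used by GetDistinctURLsFromHistory can be run manually against the database; the container and credential names below are taken from scripts/history_cleanup.sh in this change:

```bash
docker exec -t lantern-back-end_postgres_1 psql -U lantern -d lantern -c "SELECT COUNT(DISTINCT url) FROM fhir_endpoints_info_history WHERE operation='U' OR operation='I';"
```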
229 changes: 229 additions & 0 deletions endpointmanager/pkg/historycleanup/historycleanup.go
@@ -0,0 +1,229 @@
package historycleanup

import (
"context"
"database/sql"
"encoding/csv"
"encoding/json"
"os"
"strconv"

"github.com/lib/pq"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/capabilityparser"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/endpointmanager/postgresql"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers"
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/smartparser"
log "github.com/sirupsen/logrus"
)

// GetInfoHistoryDuplicateData checks the info history table and stores the identifiers of any repetitive entries in CSV files
func GetInfoHistoryDuplicateData(ctx context.Context, store *postgresql.Store, queryInterval bool) {

historyRowCount := 1
historyDuplicateRowCount := 1

var rows *sql.Rows
var distinctURLrows *sql.Rows
var err error
var existingDistinctURLs []string
var URLCaptured bool

// Get distinct URLs from the history table
distinctURLrows, err = store.GetDistinctURLsFromHistory(ctx)
helpers.FailOnError("", err)

// Open (or create if not present) csv files (in APPEND mode) to store list of distinct URLs and pruning data identifiers
// NOTE: This will create CSV files in the /home directory of the lantern-back-end_endpoint_manager_1 container
distinctURLfile, err := os.OpenFile("/home/distinctURLsFromHistory.csv", os.O_APPEND|os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
log.Fatalf("Failed to create file: %s", err)
}
defer distinctURLfile.Close()

// Read the distinctURLsFromHistory file to check whether URLs are already added to it
csvReader := csv.NewReader(distinctURLfile)
csvData, err := csvReader.ReadAll()
if err != nil {
log.Fatalf("Error reading CSV data: %v\n", err)
}

// Ignore the URLs already added during the pruning data capture operation
if len(csvData) > 0 {
log.Info("Existing distinctURLsFromHistory file detected. URLs already present in this file will be ignored.")
existingDistinctURLs = flatten2D(csvData)
}

duplicateInfoHistoryFile, err := os.OpenFile("/home/duplicateInfoHistoryIds.csv", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatalf("Failed to create file: %s", err)
}
defer duplicateInfoHistoryFile.Close()

// Create CSV writers
distinctURLWriter := csv.NewWriter(distinctURLfile)
duplicateInfoHistoryDataWriter := csv.NewWriter(duplicateInfoHistoryFile)

log.Info("Starting the duplicate info history data capture.")

for distinctURLrows.Next() {

url := getDistinctRowInfo(distinctURLrows)
URLCaptured = false

// Check whether duplicate data is already captured for the given URL
for idx, val := range existingDistinctURLs {
if url == val {
log.Info("Duplicate info history data already captured. Ignoring URL: ", url)

// Set the flag
URLCaptured = true

// Remove the URL from the list of existing URLs
existingDistinctURLs = append(existingDistinctURLs[:idx], existingDistinctURLs[idx+1:]...)
break
}
}

// Skip the current iteration if duplicate data is already captured
if URLCaptured {
continue
}

rows, err = store.PruningGetInfoHistoryUsingURL(ctx, queryInterval, url)
helpers.FailOnError("", err)

if !rows.Next() {
return
}

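// The first history row returned for this URL becomes the comparison baseline;
// each subsequent row is compared against it field by field.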
var pruningData [][]string
_, fhirURL1, _, capStat1, tlsVersion1, mimeTypes1, smartResponse1, _, requestedFhirVersion1 := getRowInfo(rows)

for rows.Next() {
log.Info("Info History Row Count: ", historyRowCount)
historyRowCount++
operation2, fhirURL2, entryDate2, capStat2, tlsVersion2, mimeTypes2, smartResponse2, valResID2, requestedFhirVersion2 := getRowInfo(rows)

equalFhirEntries := fhirURL1 == fhirURL2

if equalFhirEntries {
equalFhirEntries = (requestedFhirVersion1 == requestedFhirVersion2)

if equalFhirEntries {
equalFhirEntries = (tlsVersion1 == tlsVersion2)

if equalFhirEntries {
equalFhirEntries = helpers.StringArraysEqual(mimeTypes1, mimeTypes2)

if equalFhirEntries {
// If capstat is not null check if current entry that was passed in has capstat equal to capstat of old entry being checked from history table, otherwise check they are both null
if capStat1 != nil {
equalFhirEntries = capStat1.EqualIgnore(capStat2)
} else {
equalFhirEntries = (capStat2 == nil)
}

if equalFhirEntries {
// If smartresponse is not null check if current entry that was passed in has smartresponse equal to smartresponse of old entry being checked from history table, otherwise check they are both null
if smartResponse1 != nil {
ignoredFields := []string{}
equalFhirEntries = smartResponse1.EqualIgnore(smartResponse2, ignoredFields)
} else {
equalFhirEntries = (smartResponse2 == nil)
}
}
}
}
}
}

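// Only an unchanged UPDATE ('U') row is recorded for pruning; a row that differs
// in any compared field becomes the new baseline for the comparisons that follow.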
if equalFhirEntries && operation2 == "U" {
log.Info("Duplicate Info History Row Count: ", historyDuplicateRowCount)
historyDuplicateRowCount++
log.Infof("Duplicate Data Captured :: URL: %s, Entered At: %s, Requested FHIR Version: %s, Validation Result ID: %s", fhirURL2, entryDate2, requestedFhirVersion2, strconv.Itoa(valResID2))
pruningData = append(pruningData, []string{fhirURL2, entryDate2, requestedFhirVersion2, strconv.Itoa(valResID2)})
} else {
fhirURL1 = fhirURL2
capStat1 = capStat2
tlsVersion1 = tlsVersion2
mimeTypes1 = mimeTypes2
smartResponse1 = smartResponse2
requestedFhirVersion1 = requestedFhirVersion2
continue
}
}

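// Write and flush the captured rows, then record the URL itself, so that a
// re-run can skip URLs whose duplicate data has already been captured.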
err = duplicateInfoHistoryDataWriter.WriteAll(pruningData)
if err != nil {
log.Fatal("Error writing to duplicateInfoHistoryDataWriter:", err)
}

duplicateInfoHistoryDataWriter.Flush()
if err := duplicateInfoHistoryDataWriter.Error(); err != nil {
log.Fatal("Error flushing duplicateInfoHistoryDataWriter:", err)
}

err = distinctURLWriter.Write([]string{url})
if err != nil {
log.Fatal("Error writing to distinctURLWriter:", err)
}

distinctURLWriter.Flush()
if err := distinctURLWriter.Error(); err != nil {
log.Fatal("Error flushing distinctURLWriter:", err)
}
}
}

// flatten2D converts a 2D slice to a 1D slice
func flatten2D(data2D [][]string) []string {
var data1D []string
for _, row := range data2D {
data1D = append(data1D, row...)
}
return data1D
}

func getDistinctRowInfo(rows *sql.Rows) string {
var url string

err := rows.Scan(&url)
helpers.FailOnError("", err)

return url
}

func getRowInfo(rows *sql.Rows) (string, string, string, capabilityparser.CapabilityStatement, string, []string, smartparser.SMARTResponse, int, string) {
var capInt map[string]interface{}
var fhirURL string
var operation string
var capStatJSON []byte
var entryDate string
var tlsVersion string
var mimeTypes []string
var smartResponseJSON []byte
var smartResponseInt map[string]interface{}
var valResIDNullable sql.NullInt64
var valResID int
var requestedFhirVersion string

err := rows.Scan(&operation, &fhirURL, &capStatJSON, &entryDate, &tlsVersion, pq.Array(&mimeTypes), &smartResponseJSON, &valResIDNullable, &requestedFhirVersion)
helpers.FailOnError("", err)

if !valResIDNullable.Valid {
valResID = 0
} else {
valResID = int(valResIDNullable.Int64)
}

err = json.Unmarshal(capStatJSON, &capInt)
helpers.FailOnError("", err)
capStat, err := capabilityparser.NewCapabilityStatementFromInterface(capInt)
helpers.FailOnError("", err)

err = json.Unmarshal(smartResponseJSON, &smartResponseInt)
helpers.FailOnError("", err)
smartResponse := smartparser.NewSMARTRespFromInterface(smartResponseInt)

return operation, fhirURL, entryDate, capStat, tlsVersion, mimeTypes, smartResponse, valResID, requestedFhirVersion
}
3 changes: 3 additions & 0 deletions scripts/duplicate_info_history_check.sh
@@ -0,0 +1,3 @@
#!/bin/sh

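# Run the duplicate info history capture inside the endpoint manager container; identifiers are written to /home/duplicateInfoHistoryIds.csv in that container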
docker exec --workdir /go/src/app/cmd/historycleanup lantern-back-end_endpoint_manager_1 go run main.go || echo "Failed to run duplicate info history check script"
23 changes: 23 additions & 0 deletions scripts/history_cleanup.sh
@@ -0,0 +1,23 @@
#!/bin/sh

csv_file="../duplicateInfoHistoryIds.csv"
DB_NAME=lantern
DB_USER=lantern

# Check if the file exists
if [ ! -f "$csv_file" ]; then
echo "File $csv_file not found!"
exit 1
fi

while IFS=',' read -r col1 col2 col3 col4; do
DATE=$(date)
echo "($DATE) Deleting entries for data: $col1, $col2, $col3, $col4"

# Delete entry from the info history table
QUERY=$(echo "DELETE FROM fhir_endpoints_info_history WHERE url='$col1' AND operation='U' AND requested_fhir_version='$col3' AND entered_at = '$col2';")
(docker exec -t lantern-back-end_postgres_1 psql -t -U${DB_USER} -d ${DB_NAME} -c "${QUERY}") || echo "Error deleting entry from the info history table"

done < "$csv_file"

echo "Duplicate info history data cleanup complete."