diff --git a/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs b/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs index 9b3609bbe8..11fcb75d81 100644 --- a/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs +++ b/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs @@ -386,7 +386,7 @@ private T Activate<T, T2>(T2 databaseObject, Image<Rgba32> tabImage) uiInstance.SetDatabaseObject(this, databaseObject); - if (insertIndex is not null) + if (insertIndex is not null && _mainDockPanel.ActivePane is not null) { _mainDockPanel.ActivePane.SetContentIndex(floatable, (int)insertIndex); } diff --git a/CHANGELOG.md b/CHANGELOG.md index f56222773f..737ed346d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - Build on and target .Net 9 rather than 8 +- Add DQE Updater Mutilator for Data Loads; see [DQE Post Load Runner](./Documentation/DataLoadEngine/DQEPostLoadRunner.md) + ## [8.4.2] - 2024-12-18 - Fix issue with MEF constructing Remote Table Attachers diff --git a/Documentation/DataLoadEngine/DQEPostLoadRunner.md b/Documentation/DataLoadEngine/DQEPostLoadRunner.md new file mode 100644 index 0000000000..d244e87a4a --- /dev/null +++ b/Documentation/DataLoadEngine/DQEPostLoadRunner.md @@ -0,0 +1,12 @@ +# DQE Post Load Runner + +The DQE post-load runner can be used to automatically perform a DQE update once a data load completes. +The runner attempts to reuse any existing DQE results that are unaffected by the data load; however, this process can still be slow if the catalogue data is large and/or complex. + +## Requirements +The DQE post-load runner requires an existing DQE result; if none exists, the runner will fail. + +## Configuration +The runner makes a number of queries against the database; the timeout for these commands is configurable via the Timeout option.
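+
+For illustration, the sketch below shows how this update can be driven programmatically; it mirrors the `DQEPostLoadRunner.Mutilate` implementation later in this diff (the `catalogue`, `job` and `Timeout` values are assumed to be in scope, as they are in that method), so treat it as a hedged example rather than a separate API:
+
+```csharp
+// Sketch based on DQEPostLoadRunner.Mutilate: run the DQE as a partial update
+// scoped to the data load that has just completed.
+DqeOptions options = new()
+{
+    Catalogue = catalogue.ID.ToString(),
+    DataLoadUpdateID = job.DataLoadInfo.ID.ToString(), // reuse unaffected DQE results
+    Command = CommandLineActivity.run,
+    CommandTimeout = Timeout // seconds allowed for each internal SQL command
+};
+var runner = RunnerFactory.CreateRunner(new ThrowImmediatelyActivator(job.RepositoryLocator), options);
+runner.Run(job.RepositoryLocator, ThrowImmediatelyDataLoadEventListener.Quiet,
+    new AcceptAllCheckNotifier(), new GracefulCancellationToken());
+```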
+ + diff --git a/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs new file mode 100644 index 0000000000..e58425ba1a --- /dev/null +++ b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs @@ -0,0 +1,415 @@ +using NPOI.SS.Formula.Functions; +using NUnit.Framework; +using Rdmp.Core.Curation; +using Rdmp.Core.Curation.Data; +using Rdmp.Core.Curation.Data.DataLoad; +using Rdmp.Core.Curation.Data.Defaults; +using Rdmp.Core.DataFlowPipeline; +using Rdmp.Core.DataLoad; +using Rdmp.Core.DataLoad.Engine.Checks.Checkers; +using Rdmp.Core.DataLoad.Engine.DatabaseManagement.EntityNaming; +using Rdmp.Core.DataLoad.Engine.Job; +using Rdmp.Core.DataLoad.Engine.LoadExecution; +using Rdmp.Core.DataLoad.Engine.LoadProcess; +using Rdmp.Core.DataLoad.Modules.Attachers; +using Rdmp.Core.DataLoad.Modules.DataProvider; +using Rdmp.Core.DataLoad.Modules.Mutilators; +using Rdmp.Core.DataLoad.Triggers; +using Rdmp.Core.DataQualityEngine.Data; +using Rdmp.Core.DataQualityEngine.Reports; +using Rdmp.Core.Repositories; +using Rdmp.Core.ReusableLibraryCode.Checks; +using Rdmp.Core.ReusableLibraryCode.Progress; +using Rdmp.Core.Tests.DataLoad.Engine.Integration; +using System.Collections.Generic; +using System.Data; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Threading; + +namespace Rdmp.Core.Tests.DataQualityEngine +{ + internal class DQEPartialUpdateTests : DataLoadEngineTestsBase + { + + readonly string validatorXML = "\r\n\r\n \r\n \r\n \r\n Wrong\r\n \r\n chi\r\n \r\n \r\n \r\n time\r\n \r\n \r\n \r\n"; + readonly string fileLocation = Path.GetTempPath(); + readonly string fileName = "SteppedDQEPartialUpdates.csv"; + + [Test] + public void SteppedDQEPartialUpdates() + { + var server = GetCleanedServer(FAnsi.DatabaseType.MicrosoftSQLServer); + + var dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Columns.Add("hic_dataLoadRunID"); + dt.Rows.Add(new object[] { "1111111111", "A", "2024-12-01",10 }); + dt.Rows.Add(new object[] { "1111111112", "A", "2024-11-01",10 }); + + var table = server.CreateTable("PartialToaDQE", dt); + table.CreatePrimaryKey(table.DiscoverColumns().Where(c => c.GetRuntimeName() == "chi").ToArray()); + dt.Dispose(); + var catalogue = new Catalogue(CatalogueRepository, "PartialToaDQE"); + var importer = new TableInfoImporter(CatalogueRepository, table); + importer.DoImport(out var _tableInfo, out var _columnInfos); + foreach (var columnInfo in _columnInfos) + { + var ci = new CatalogueItem(CatalogueRepository, catalogue, columnInfo.GetRuntimeName()); + ci.SaveToDatabase(); + var ei = new ExtractionInformation(CatalogueRepository, ci, columnInfo, ""); + ei.SaveToDatabase(); + } + var dqeRepository = new DQERepository(CatalogueRepository); + + catalogue.ValidatorXML = validatorXML; + catalogue.TimeCoverage_ExtractionInformation_ID = catalogue.GetAllExtractionInformation(ExtractionCategory.Any) + .Single(e => e.GetRuntimeName().Equals("time")).ID; + + catalogue.PivotCategory_ExtractionInformation_ID = catalogue.GetAllExtractionInformation(ExtractionCategory.Any) + .Single(e => e.GetRuntimeName().Equals("value")).ID; + + var report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + var source = new CancellationTokenSource(); + + var listener = new ToMemoryDataLoadEventListener(false); + 
report.GenerateReport(catalogue, listener, source.Token); + var lmd = new LoadMetadata(CatalogueRepository, "MyLoad"); + lmd.LocationOfForLoadingDirectory = Path.GetTempPath(); + lmd.LocationOfForArchivingDirectory = Path.GetTempPath(); + lmd.LocationOfExecutablesDirectory = Path.GetTempPath(); + lmd.LocationOfCacheDirectory = Path.GetTempPath(); + lmd.SaveToDatabase(); + var loggingServer = CatalogueRepository.GetDefaultFor(PermissableDefaults.LiveLoggingServer_ID); + var logManager = new Core.Logging.LogManager(loggingServer); + logManager.CreateNewLoggingTaskIfNotExists(lmd.Name); + catalogue.LoggingDataTask = lmd.Name; + catalogue.SaveToDatabase(); + lmd.LinkToCatalogue(catalogue); + + //fetch files + var fetchDataProcessTask = new ProcessTask(CatalogueRepository, lmd, LoadStage.GetFiles); + fetchDataProcessTask.ProcessTaskType = ProcessTaskType.DataProvider; + fetchDataProcessTask.Path = "Rdmp.Core.DataLoad.Modules.DataProvider.ImportFilesDataProvider"; + fetchDataProcessTask.SaveToDatabase(); + + fetchDataProcessTask.CreateArgumentsForClassIfNotExists(); + fetchDataProcessTask.SetArgumentValue("DirectoryPath", fileLocation); + fetchDataProcessTask.SetArgumentValue("FilePattern", fileName); + fetchDataProcessTask.SaveToDatabase(); + + //load file + var attachProcessTask = new ProcessTask(CatalogueRepository, lmd, LoadStage.Mounting); + attachProcessTask.ProcessTaskType = ProcessTaskType.Attacher; + attachProcessTask.Path = "Rdmp.Core.DataLoad.Modules.Attachers.AnySeparatorFileAttacher"; + attachProcessTask.SaveToDatabase(); + attachProcessTask.CreateArgumentsForClassIfNotExists(); + attachProcessTask.SetArgumentValue("Separator", ","); + attachProcessTask.SetArgumentValue("FilePattern", fileName); + attachProcessTask.SetArgumentValue("TableToLoad", _tableInfo); + attachProcessTask.SaveToDatabase(); + + var dqeUpdate = new ProcessTask(CatalogueRepository, lmd, LoadStage.PostLoad); + dqeUpdate.ProcessTaskType = ProcessTaskType.MutilateDataTable; + dqeUpdate.Path = "Rdmp.Core.DataLoad.Modules.Mutilators.DQEPostLoadRunner"; + dqeUpdate.CreateArgumentsForClassIfNotExists(); + dqeUpdate.SaveToDatabase(); + + //first load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "A", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "B", "2024-10-01" }); + SetupFile(dt); + PerformLoad(lmd, logManager); + //end of first load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + var evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(3)); + CompareEvaluations(evaluations[1], evaluations[2]); + + //second load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "A", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "C", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "D", "2024-10-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of second load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository 
+ }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(5)); + CompareEvaluations(evaluations[3], evaluations[4]); + + //third load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "C", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "C", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "A", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "B", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "E", "2024-08-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of third load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(7)); + CompareEvaluations(evaluations[6], evaluations[5]); + + //fourth load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "A", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "A", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "D", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "B", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "C", "2024-08-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of fourth load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList();//.Where(e => e.CatalogueID == catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(9)); + CompareEvaluations(evaluations[8], evaluations[7]); + + //fifth load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "C", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "B", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "D", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "A", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "A", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "A", "2024-08-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of fifth load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(11)); + CompareEvaluations(evaluations[10], evaluations[9]); + + //sixth load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + 
dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "C", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "B", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "B", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "C", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "D", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "A", "2024-08-01" }); + dt.Rows.Add(new string[] { "1111111117", "A", "2024-07-01" }); + dt.Rows.Add(new string[] { "1111111118", "B", "2024-06-01" }); + dt.Rows.Add(new string[] { "1111111119", "C", "2024-05-01" }); + dt.Rows.Add(new string[] { "1111111120", "D", "2024-04-01" }); + dt.Rows.Add(new string[] { "1111111121", "E", "2024-03-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of sixth load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + source.Dispose(); + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(13)); + CompareEvaluations(evaluations[12], evaluations[11]); + } + + private void SetupFile(DataTable dt) + { + if (File.Exists(Path.Combine(fileLocation, fileName))) + { + File.Delete(Path.Combine(fileLocation, fileName)); + } + var fs = File.Create(Path.Combine(fileLocation, fileName)); + fs.Close(); + var lines = new List() { + string.Join(',',dt.Columns.Cast().Select(c => c.ColumnName)) + }; + foreach (var row in dt.AsEnumerable()) + { + lines.Add(string.Join(',', row.ItemArray.Select(i => i.ToString()))); + } + File.AppendAllLines(Path.Combine(fileLocation, fileName), lines); + dt.Dispose(); + } + + private void CompareEvaluations(Evaluation e1, Evaluation e2) + { + Assert.That(e1.ColumnStates.Length, Is.EqualTo(e2.ColumnStates.Length)); + Assert.That(e1.RowStates.Length, Is.EqualTo(e2.RowStates.Length)); + List columnStateDiff = e1.ColumnStates.Except(e2.ColumnStates, new ColumnStateCompare()).ToList(); + Assert.That(columnStateDiff.Count, Is.EqualTo(0)); + columnStateDiff = e2.ColumnStates.Except(e1.ColumnStates, new ColumnStateCompare()).ToList(); + Assert.That(columnStateDiff.Count, Is.EqualTo(0)); + List rowStateDiff = e1.RowStates.Except(e2.RowStates, new RowStateCompare()).ToList(); + Assert.That(rowStateDiff.Count, Is.EqualTo(0)); + rowStateDiff = e2.RowStates.Except(e1.RowStates, new RowStateCompare()).ToList(); + Assert.That(rowStateDiff.Count, Is.EqualTo(0)); + + Assert.That(e1.GetPivotCategoryValues(), Is.EqualTo(e2.GetPivotCategoryValues())); + foreach (var category in e1.GetPivotCategoryValues()) + { + var e1Periodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(e1, category, false); + e1Periodicity.Columns.Remove("Evaluation_ID"); + var e2Periodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(e2, category, false); + e2Periodicity.Columns.Remove("Evaluation_ID"); + var differences = + e1Periodicity.AsEnumerable().Except(e2Periodicity.AsEnumerable(), + DataRowComparer.Default); + Assert.That(differences.Any(), Is.False); + + } + } + + private class ColumnStateCompare : IEqualityComparer + { + public ColumnStateCompare() + { + } + public bool Equals(ColumnState x, ColumnState y) + { + return x.TargetProperty == y.TargetProperty && + x.PivotCategory == y.PivotCategory && + x.CountCorrect == y.CountCorrect && + x.CountMissing == y.CountMissing && + 
x.CountWrong == y.CountWrong && + x.CountInvalidatesRow == y.CountInvalidatesRow && + x.CountDBNull == y.CountDBNull; + } + public int GetHashCode(T obj) + { + return obj.GetHashCode(); + } + + public int GetHashCode([DisallowNull] ColumnState obj) + { + return 1; + } + } + + private class RowStateCompare : IEqualityComparer + { + public RowStateCompare() + { + } + public bool Equals(RowState x, RowState y) + { + return x.Correct == y.Correct && + x.Missing == y.Missing && + x.Wrong == y.Wrong && + x.Invalid == y.Invalid && + x.PivotCategory == y.PivotCategory; + + } + public int GetHashCode(T obj) + { + return obj.GetHashCode(); + } + + public int GetHashCode([DisallowNull] RowState obj) + { + return 1; + } + } + + + private void PerformLoad(LoadMetadata lmd, Core.Logging.LogManager logManager) + { + var dbConfig = new HICDatabaseConfiguration(lmd, null); + var projectDirectory = SetupLoadDirectory(lmd); + var job = new DataLoadJob(RepositoryLocator, "Go go go!", logManager, lmd, projectDirectory, + ThrowImmediatelyDataLoadEventListener.Quiet, dbConfig); + + new PreExecutionChecker(lmd, dbConfig).Check( + new AcceptAllCheckNotifier()); + + var loadFactory = new HICDataLoadFactory( + lmd, + dbConfig, + new HICLoadConfigurationFlags(), + CatalogueRepository, + logManager + ); + + var exe = loadFactory.Create(ThrowImmediatelyDataLoadEventListener.Quiet); + + var exitCode = exe.Run( + job, + new GracefulCancellationToken()); + + Assert.That(exitCode, Is.EqualTo(ExitCodeType.Success)); + } + + } +} diff --git a/Rdmp.Core/CommandLine/Options/DqeOptions.cs b/Rdmp.Core/CommandLine/Options/DqeOptions.cs index 8b37f68a1b..fa5dabe5f7 100644 --- a/Rdmp.Core/CommandLine/Options/DqeOptions.cs +++ b/Rdmp.Core/CommandLine/Options/DqeOptions.cs @@ -16,4 +16,10 @@ public class DqeOptions : RDMPCommandLineOptions { [Option('c', "Catalogue", HelpText = "ID of the Catalogue to run the DQE on", Required = true)] public string Catalogue { get; set; } + + [Option('d', "DataLoad", HelpText = "ID of the Data Load to run the DQE on. Adds new data to existing DQE results if they exist", Required = false)] + public string DataLoadUpdateID { get; set; } + + [Option('t', "Timeout", HelpText = "How long (in seconds) each internal SQL command should run for before timing out")] + public int CommandTimeout { get; set; } } \ No newline at end of file diff --git a/Rdmp.Core/CommandLine/Runners/DqeRunner.cs b/Rdmp.Core/CommandLine/Runners/DqeRunner.cs index 2746c269ee..36b71dc2bc 100644 --- a/Rdmp.Core/CommandLine/Runners/DqeRunner.cs +++ b/Rdmp.Core/CommandLine/Runners/DqeRunner.cs @@ -5,6 +5,7 @@ // You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>. using System; +using Org.BouncyCastle.Security.Certificates; using Rdmp.Core.CommandLine.Options; using Rdmp.Core.Curation.Data; using Rdmp.Core.DataFlowPipeline; @@ -29,12 +30,19 @@ public override int Run(IRDMPPlatformRepositoryServiceLocator repositoryLocator, ICheckNotifier checkNotifier, GracefulCancellationToken token) { var catalogue = GetObjectFromCommandLineString<Catalogue>(repositoryLocator, _options.Catalogue); + int?
dataLoadID = null; + if (_options.DataLoadUpdateID != null) + dataLoadID = int.Parse(_options.DataLoadUpdateID); + var report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID); switch (_options.Command) { case CommandLineActivity.run: - report.GenerateReport(catalogue, listener, token.AbortToken); + if (dataLoadID is not null) + report.UpdateReport(catalogue, (int)dataLoadID, _options.CommandTimeout, listener, token.AbortToken); + else + report.GenerateReport(catalogue, listener, token.AbortToken); return 0; case CommandLineActivity.check: diff --git a/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs b/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs index 58c1cb78e1..4b8a95df48 100644 --- a/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs +++ b/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs @@ -208,4 +208,4 @@ public AggregateFilter ShallowClone(AggregateFilterContainer into) CopyShallowValuesTo(clone); return clone; } -} \ No newline at end of file +} diff --git a/Rdmp.Core/Curation/Data/ExtractionFilter.cs b/Rdmp.Core/Curation/Data/ExtractionFilter.cs index 7731a1bbf5..668dad4538 100644 --- a/Rdmp.Core/Curation/Data/ExtractionFilter.cs +++ b/Rdmp.Core/Curation/Data/ExtractionFilter.cs @@ -187,4 +187,4 @@ public override void DeleteInDatabase() base.DeleteInDatabase(); } -} \ No newline at end of file +} diff --git a/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs b/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs index b77b5d6efb..060cdbac52 100644 --- a/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs +++ b/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs @@ -11,6 +11,7 @@ using Rdmp.Core.DataLoad.Engine.Attachers; using Rdmp.Core.DataLoad.Engine.Job; using Rdmp.Core.DataLoad.Engine.LoadExecution.Components.Arguments; +using Rdmp.Core.DataLoad.Modules.Attachers; using Rdmp.Core.Repositories; using Rdmp.Core.ReusableLibraryCode.Checks; using Rdmp.Core.ReusableLibraryCode.Progress; @@ -29,6 +30,8 @@ public class AttacherRuntimeTask : RuntimeTask, IMEFRuntimeTask public AttacherRuntimeTask(IProcessTask task, RuntimeArgumentCollection args) : base(task, args) { + + //RequestsExternalDatabaseCreation //All attachers must be marked as mounting stages, and therefore we can pull out the RAW Server and Name var mountingStageArgs = args.StageSpecificArguments; if (mountingStageArgs.LoadStage != LoadStage.Mounting) diff --git a/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs b/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs index 5d1f664ce2..c9128e3270 100644 --- a/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs +++ b/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs @@ -27,6 +27,8 @@ namespace Rdmp.Core.DataLoad.Modules.Mutilators; public class DQEPostLoadRunner : IMutilateDataTables { + [DemandsInitialization("Timeout length for each query required to run the DQE update",defaultValue:50000)] + public int Timeout { get; set; } public void Check(ICheckNotifier notifier) { } @@ -73,7 +75,9 @@ public ExitCodeType Mutilate(IDataLoadJob job) DqeOptions options = new() { Catalogue = catalogue.ID.ToString(), - Command = CommandLineActivity.run + DataLoadUpdateID = job.DataLoadInfo.ID.ToString(), + Command = CommandLineActivity.run, + CommandTimeout = Timeout }; var runner = RunnerFactory.CreateRunner(new ThrowImmediatelyActivator(job.RepositoryLocator), options); 
runner.Run(job.RepositoryLocator, ThrowImmediatelyDataLoadEventListener.Quiet, new AcceptAllCheckNotifier(), diff --git a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs index 7ccfe6f8a0..f16b7672ed 100644 --- a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs +++ b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs @@ -10,6 +10,7 @@ using System.Text; using FAnsi; using FAnsi.Discovery; +using MongoDB.Driver; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Spontaneous; using Rdmp.Core.QueryBuilding; @@ -113,7 +114,7 @@ public void FetchData(ICheckNotifier checkNotifier) CheckResult.Success)); GetInsertData(server, database, checkNotifier); - GetUpdatetData(server, database, checkNotifier); + GetUpdatedData(server, database, checkNotifier); } catch (Exception e) { @@ -163,7 +164,7 @@ private void GetInsertData(DiscoveredServer server, DiscoveredDatabase database, } - private void GetUpdatetData(DiscoveredServer server, DiscoveredDatabase database, ICheckNotifier checkNotifier) + private void GetUpdatedData(DiscoveredServer server, DiscoveredDatabase database, ICheckNotifier checkNotifier) { const string archive = "archive"; const string zzArchive = "zzarchivezz"; @@ -191,7 +192,8 @@ private void GetUpdatetData(DiscoveredServer server, DiscoveredDatabase database --Records which appear in the archive SELECT top {{0}} {{6}}, -{{7}} +{{7}}, +{{8}} FROM {{1}} CROSS APPLY ( @@ -200,7 +202,7 @@ SELECT TOP 1 {{2}}.* WHERE {{3}} order by {syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom)} desc - ) {{8}} + ) {{9}} where {{1}}.{{4}} = {{5}}"; break; @@ -214,13 +216,14 @@ SELECT TOP 1 {{2}}.* /*Records which appear in the archive*/ SELECT {{6}}, -{{7}} +{{7}}, +{{8}} FROM {{1}} Join -{{2}} {{8}} on {whereStatement.Replace(archiveTableName, archive)} +{{2}} {{9}} on {whereStatement.Replace(archiveTableName, archive)} AND - {{8}}.{{9}} = (select max({syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom)}) from {{2}} s where {whereStatement.Replace(archiveTableName, archive).Replace(tableName, "s")}) + {{9}}.{{10}} = (select max({syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom)}) from {{2}} s where {whereStatement.Replace(archiveTableName, archive).Replace(tableName, "s")}) where {{1}}.{{4}} = {{5}} @@ -241,8 +244,9 @@ SELECT TOP 1 {{2}}.* _dataLoadRunID, //{5} GetSharedColumnsSQL(tableName), //{6} GetSharedColumnsSQLWithColumnAliasPrefix(archive, zzArchive), //{7} - archive, //{8} - syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom) + GetHICSpecialColumns(archive, zzArchive),//{8} + archive, //{9} + syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom) //{10} ); var dtComboTable = new DataTable(); @@ -253,11 +257,15 @@ SELECT TOP 1 {{2}}.* //add the columns from the combo table to both views foreach (DataColumn col in dtComboTable.Columns) + { if (!col.ColumnName.StartsWith(zzArchive, StringComparison.InvariantCultureIgnoreCase)) { Updates_New.Columns.Add(col.ColumnName, col.DataType); Updates_Replaced.Columns.Add(col.ColumnName, col.DataType); } + } + Updates_Replaced.Columns.Add(SpecialFieldNames.DataLoadRunID, typeof(int)); + Updates_Replaced.Columns.Add(SpecialFieldNames.ValidFrom, typeof(DateTime)); foreach (DataRow fromRow in dtComboTable.Rows) { @@ -272,6 +280,13 @@ SELECT TOP 1 {{2}}.* } } + private string GetHICSpecialColumns(string tableName, string columnAliasPrefix = "") + { + return $@"{tableName}.{SpecialFieldNames.DataLoadRunID} as 
{columnAliasPrefix}{SpecialFieldNames.DataLoadRunID}, +{tableName}.{SpecialFieldNames.ValidFrom} as {columnAliasPrefix}{SpecialFieldNames.ValidFrom} +"; + } + private string GetSharedColumnsSQLWithColumnAliasPrefix(string tableName, string columnAliasPrefix) { var sb = new StringBuilder(); diff --git a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs index c3b7003975..9c51826277 100644 --- a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs @@ -145,8 +145,7 @@ public void Commit(Evaluation evaluation, string pivotCategory, DbConnection con DatabaseCommandHelper.AddParameterWithValueToCommand("@PivotCategory", cmd, pivotCategory); cmd.ExecuteNonQuery(); } - - + IsCommitted = true; } } \ No newline at end of file diff --git a/Rdmp.Core/DataQualityEngine/Data/RowState.cs b/Rdmp.Core/DataQualityEngine/Data/RowState.cs index 1ccf05bf0b..0b600039ff 100644 --- a/Rdmp.Core/DataQualityEngine/Data/RowState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/RowState.cs @@ -25,6 +25,18 @@ public class RowState public string PivotCategory { get; private set; } + public RowState(int dataLoadRunID, int correct, int missing, int wrong, int invalid, + string validatorXml, string pivotCategory) + { + Correct = correct; + Missing = missing; + Wrong = wrong; + Invalid = invalid; + ValidatorXML = validatorXml; + DataLoadRunID = dataLoadRunID; + PivotCategory = pivotCategory; + } + public RowState(DbDataReader r) { Correct = Convert.ToInt32(r["Correct"]); diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index d7df51dc20..4c46c07d52 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -1,4 +1,4 @@ -// Copyright (c) The University of Dundee 2018-2019 +// Copyright (c) The University of Dundee 2018-2025 // This file is part of the Research Data Management Platform (RDMP). // RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. @@ -6,13 +6,15 @@ using System; using System.Collections.Generic; +using System.Data; using System.Data.Common; -using System.Diagnostics; using System.Linq; using System.Threading; using FAnsi.Discovery; +using MongoDB.Driver; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Defaults; +using Rdmp.Core.DataLoad.Triggers; using Rdmp.Core.DataQualityEngine.Data; using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; using Rdmp.Core.Logging; @@ -42,15 +44,15 @@ public class CatalogueConstraintReport : DataQualityReport private Validator _validator; private bool _containsDataLoadID; - public static int MaximumPivotValues = 5000; - - private Dictionary<string, DQEStateOverDataLoadRunId> byPivotRowStatesOverDataLoadRunId = new(); - private Dictionary<string, PeriodicityCubesOverTime> byPivotCategoryCubesOverTime = new(); + private Dictionary<string, DQEStateOverDataLoadRunId> byPivotRowStatesOverDataLoadRunId = []; + private Dictionary<string, PeriodicityCubesOverTime> byPivotCategoryCubesOverTime = []; private IExternalDatabaseServer _loggingServer; private string _loggingTask; private LogManager _logManager; + private int?
_dataLoadID; + /// /// Set this property to use an explicit DQE results store database instead of the /// default DQE database indicated by the @@ -89,10 +91,8 @@ private void SetupLogging(ICatalogueRepository repository) } } - private bool haveComplainedAboutNullCategories; - public override void GenerateReport(ICatalogue c, IDataLoadEventListener listener, - CancellationToken cancellationToken) + CancellationToken cancellationToken) { SetupLogging(c.CatalogueRepository); @@ -105,18 +105,15 @@ public override void GenerateReport(ICatalogue c, IDataLoadEventListener listene { _catalogue = c; var dqeRepository = ExplicitDQERepository ?? new DQERepository(c.CatalogueRepository); - - byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL")); - byPivotRowStatesOverDataLoadRunId.Add("ALL", new DQEStateOverDataLoadRunId("ALL")); - + DbDataReader r; Check(new FromDataLoadEventListenerToCheckNotifier(forker)); - - var sw = Stopwatch.StartNew(); using (var con = _server.GetConnection()) { con.Open(); - - var cmd = _server.GetCommand(_queryBuilder.SQL, con); + var qb = _queryBuilder; + if (_dataLoadID is not null) + qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, con); cmd.CommandTimeout = 500000; var t = cmd.ExecuteReaderAsync(cancellationToken); @@ -125,90 +122,16 @@ public override void GenerateReport(ICatalogue c, IDataLoadEventListener listene if (cancellationToken.IsCancellationRequested) throw new OperationCanceledException("User cancelled DQE while fetching data"); - var r = t.Result; - - var progress = 0; - - while (r.Read()) - { - cancellationToken.ThrowIfCancellationRequested(); - - progress++; - var dataLoadRunIDOfCurrentRecord = 0; - //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available) - - //if the DataReader is likely to have a data load run ID column - if (_containsDataLoadID) - { - //get data load run id - var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]); - - //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue) - if (runID != null) - dataLoadRunIDOfCurrentRecord = (int)runID; - } - - string pivotValue = null; - - //if the user has a pivot category configured - if (_pivotCategory != null) - { - pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker); - - if (!haveComplainedAboutNullCategories && string.IsNullOrWhiteSpace(pivotValue)) - { - forker.OnNotify(this, - new NotifyEventArgs(ProgressEventType.Warning, - $"Found a null/empty value for pivot category '{_pivotCategory}', this record will ONLY be recorded under ALL and not its specific category, you will not be warned of further nulls because there are likely to be many if there are any")); - haveComplainedAboutNullCategories = true; - pivotValue = null; - } - } - - //always increase the "ALL" category - ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, - byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]); - - //if there is a value in the current record for the pivot column - if (pivotValue != null) - { - //if it is a novel - if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime)) - { - //we will need to expand the dictionaries - if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues) - throw new OverflowException( - $"Encountered more than 
{MaximumPivotValues} values for the pivot column {_pivotCategory} this will result in crazy space usage since it is a multiplicative scale of DQE tesseracts"); - - //expand both the time periodicity and the state results - byPivotRowStatesOverDataLoadRunId.Add(pivotValue, - new DQEStateOverDataLoadRunId(pivotValue)); - periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue); - byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime); - } - - //now we are sure that the dictionaries have the category field we can increment it - ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, -periodicityCubesOverTime, byPivotRowStatesOverDataLoadRunId[pivotValue]); - } - - if (progress % 5000 == 0) - forker.OnProgress(this, - new ProgressEventArgs($"Processing {_catalogue}", - new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); - } + r = t.Result; + var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, r); + reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); - //final value - forker.OnProgress(this, - new ProgressEventArgs($"Processing {_catalogue}", - new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); - con.Close(); + byPivotCategoryCubesOverTime = reportBuilder.GetByPivotCategoryCubesOverTime(); + byPivotRowStatesOverDataLoadRunId = reportBuilder.GetByPivotRowStatesOverDataLoadRunId(); } - sw.Stop(); - foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) - state.CalculateFinalValues(); + //now commit results using (var con = dqeRepository.BeginNewTransactedConnection()) @@ -251,24 +174,389 @@ e is OperationCanceledException } } - private bool _haveComplainedAboutTrailingWhitespaces; + //Notes + // this is technically more efficient than a full DQE, but it's pretty rubbish for categories with updates as we recalculate for the whole category + //may be worth thinking about how we can keep existing records and modify/add to them depending on what's going on + - private string GetStringValueForPivotField(object o, IDataLoadEventListener listener) + public void UpdateReport(ICatalogue c, int dataLoadID, int? commandTimeout, IDataLoadEventListener listener, + CancellationToken cancellationToken) { - if (o == null || o == DBNull.Value) - return null; + _dataLoadID = dataLoadID; + SetupLogging(c.CatalogueRepository); - var stringValue = o.ToString(); - var trimmedValue = stringValue.Trim(); + var toDatabaseLogger = new ToLoggingDatabaseDataLoadEventListener(this, _logManager, _loggingTask, + $"DQE evaluation of {c}"); - if (!_haveComplainedAboutTrailingWhitespaces && stringValue != trimmedValue) + var forker = new ForkDataLoadEventListener(listener, toDatabaseLogger); + try { - listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, - $"Found trailing/leading whitespace in value in Pivot field, this will be trimmed off:'{o}'")); - _haveComplainedAboutTrailingWhitespaces = true; - } + _catalogue = c; + var dqeRepository = ExplicitDQERepository ?? new DQERepository(c.CatalogueRepository); + //make report for new data + DataTable rDT = new(); + Check(new FromDataLoadEventListenerToCheckNotifier(forker)); - return trimmedValue; + using (var con = _server.GetConnection()) + { + con.Open(); + var qb = _queryBuilder; + if (_dataLoadID is not null) + qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, con); + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; + var adapter = _server.GetDataAdapter(cmd); + rDT.BeginLoadData(); + adapter.Fill(rDT); + rDT.EndLoadData(); + con.Close(); + } + var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, rDT); + reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var newByPivotRowStatesOverDataLoadRunId = reportBuilder.GetByPivotRowStatesOverDataLoadRunId(); + + var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); + + var incomingPivotCategories = rDT.AsEnumerable().Select(r => r[pivotColumn].ToString()).ToList().Distinct(); + + using (var con = dqeRepository.BeginNewTransactedConnection()) + { + var previousEvaluation = dqeRepository.GetAllObjectsWhere<Evaluation>("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); + var previousColumnStates = previousEvaluation.ColumnStates; + var previousRowSates = previousEvaluation.RowStates; + var previousCategories = previousEvaluation.GetPivotCategoryValues().Where(c => c != "ALL"); + + var evaluation = new Evaluation(dqeRepository, _catalogue); + + //new pivotCategories coming in + var newIncomingPivotCategories = incomingPivotCategories.Where(c => !previousCategories.Contains(c)); + List<ColumnState> ColumnStates = []; + + + var pivotColumnInfo = _catalogue.CatalogueItems.Where(ci => ci.Name == _pivotCategory).FirstOrDefault(); + if (pivotColumnInfo is null) throw new Exception("Can't find column info for pivot category"); + var tableInfo = pivotColumnInfo.ColumnInfo.TableInfo; + + var dataDiffFetcher = new DiffDatabaseDataFetcher(2147483647, tableInfo, (int)_dataLoadID, commandTimeout != null ? (int)commandTimeout : 30); + dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); + //pivot categories that have been replaced 100%?
+ var replacedPivotCategories = previousCategories.Where(c => + { + if (incomingPivotCategories.Contains(c)) return false;//not a total replacement + var replacedCount = dataDiffFetcher.Updates_Replaced.AsEnumerable().Where(r => r[_pivotCategory].ToString() == c).Count(); + var previousRowState = previousRowSates.Where(rs => rs.PivotCategory == c).FirstOrDefault(); + if (previousRowState is null) return false; //did not exist before + var previousEvaluationTotal = previousRowState.Correct + previousRowState.Missing + previousRowState.Wrong + previousRowState.Invalid; + return replacedCount == previousEvaluationTotal; + }); + + // existing pivot categories coming in + var existingIncomingPivotCategories = incomingPivotCategories.Where(c => previousCategories.Contains(c) && !replacedPivotCategories.Contains(c) && c != "ALL"); + + + //* Row States *// + //unchanged categories + foreach (var previousRowState in previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) + { + //copy row states that have not changed + evaluation.AddRowState(previousRowState.DataLoadRunID, previousRowState.Correct, previousRowState.Missing, previousRowState.Wrong, previousRowState.Invalid, previousRowState.ValidatorXML, previousRowState.PivotCategory, con.Connection, con.Transaction); + } + //new categories + foreach (var newCategory in newIncomingPivotCategories) + { + newByPivotRowStatesOverDataLoadRunId.TryGetValue(newCategory, out DQEStateOverDataLoadRunId incomingState); + incomingState.RowsPassingValidationByDataLoadRunID.TryGetValue((int)_dataLoadID, out int correct); + incomingState.WorstConsequencesByDataLoadRunID.TryGetValue((int)_dataLoadID, out Dictionary<Consequence, int> results); + results.TryGetValue(Consequence.Missing, out int missing); + results.TryGetValue(Consequence.Wrong, out int wrong); + results.TryGetValue(Consequence.InvalidatesRow, out int invalidatesRow); + evaluation.AddRowState((int)_dataLoadID, correct, missing, wrong, invalidatesRow, _catalogue.ValidatorXML, newCategory, con.Connection, con.Transaction); + + incomingState.AllColumnStates.TryGetValue((int)_dataLoadID, out ColumnState[] columnStates); + foreach (var columnState in columnStates) + { + columnState.Commit(evaluation, newCategory, con.Connection, con.Transaction); + ColumnStates.Add(columnState); + } + } + //Updates + if (existingIncomingPivotCategories.Any()) + { + //existing row states with new entries + var updatedRowsDataTable = new DataTable(); + var qb = new QueryBuilder(null, ""); + + using (var updateCon = _server.GetConnection()) + { + updateCon.Open(); + qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); + qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, updateCon); + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; + var adapter = _server.GetDataAdapter(cmd); + updatedRowsDataTable.BeginLoadData(); + adapter.Fill(updatedRowsDataTable); + updatedRowsDataTable.EndLoadData(); + updateCon.Close(); + } + var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); + updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var
updatedByPivotRowStatesOverDataLoadRunId = updatedRowsReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); + + foreach (var updatedCategory in existingIncomingPivotCategories) + { + updatedByPivotRowStatesOverDataLoadRunId.TryGetValue(updatedCategory, out DQEStateOverDataLoadRunId incomingState); + foreach (var loadId in incomingState.RowsPassingValidationByDataLoadRunID.Keys) + { + incomingState.RowsPassingValidationByDataLoadRunID.TryGetValue(loadId, out int _correct); + incomingState.WorstConsequencesByDataLoadRunID.TryGetValue(loadId, out Dictionary results); + results.TryGetValue(Consequence.Missing, out int _missing); + results.TryGetValue(Consequence.Wrong, out int _wrong); + results.TryGetValue(Consequence.InvalidatesRow, out int _invalidatesRow); + evaluation.AddRowState(loadId, _correct, _missing, _wrong, _invalidatesRow, _catalogue.ValidatorXML, updatedCategory, con.Connection, con.Transaction); + + incomingState.AllColumnStates.TryGetValue(loadId, out ColumnState[] columnStates); + foreach (var columnState in columnStates) + { + columnState.Commit(evaluation, updatedCategory, con.Connection, con.Transaction); + ColumnStates.Add(columnState); + } + } + } + } + List AllStates = []; + foreach (var rowState in evaluation.RowStates) + { + if (!AllStates.Any(state => state.DataLoadRunID == rowState.DataLoadRunID)) + { + AllStates.Add(new RowState(rowState.DataLoadRunID, rowState.Correct, rowState.Missing, rowState.Wrong, rowState.Invalid, _catalogue.ValidatorXML, "ALL")); + } + else + { + var current = AllStates.Where(state => state.DataLoadRunID == rowState.DataLoadRunID).FirstOrDefault(); + if (current is not null) + { + var newState = new RowState(rowState.DataLoadRunID, rowState.Correct + current.Correct, rowState.Missing + current.Missing, rowState.Wrong + current.Wrong, rowState.Invalid + current.Invalid, _catalogue.ValidatorXML, "ALL"); + AllStates = AllStates.Where(state => state.DataLoadRunID != rowState.DataLoadRunID).ToList(); + AllStates.Add(newState); + } + } + } + foreach (var state in AllStates) + { + evaluation.AddRowState(state.DataLoadRunID, state.Correct, state.Missing, state.Wrong, state.Invalid, _catalogue.ValidatorXML, "ALL", con.Connection, con.Transaction); + + } + //* Column States *// + //unchanged + foreach (var previousColumnState in previousColumnStates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) + { + var cm = new ColumnState(previousColumnState.TargetProperty, previousColumnState.DataLoadRunID, previousColumnState.ItemValidatorXML) + { + CountCorrect = previousColumnState.CountCorrect, + CountMissing = previousColumnState.CountMissing, + CountWrong = previousColumnState.CountWrong, + CountInvalidatesRow = previousColumnState.CountInvalidatesRow, + CountDBNull = previousColumnState.CountDBNull + }; + cm.Commit(evaluation, previousColumnState.PivotCategory, con.Connection, con.Transaction); + ColumnStates.Add(cm); + } + List AllColumns = []; + foreach (var columnState in ColumnStates) + { + if (!AllColumns.Any(state => state.DataLoadRunID == columnState.DataLoadRunID && state.TargetProperty == columnState.TargetProperty && state.PivotCategory == columnState.PivotCategory)) + { + var cm = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) + { + CountCorrect = columnState.CountCorrect, + CountMissing = columnState.CountMissing, + CountWrong = columnState.CountWrong, + CountInvalidatesRow = 
columnState.CountInvalidatesRow, + CountDBNull = columnState.CountDBNull + }; + AllColumns.Add(cm); + } + else + { + var index = AllColumns.FindIndex(state => state.DataLoadRunID == columnState.DataLoadRunID && state.TargetProperty == columnState.TargetProperty && state.PivotCategory == columnState.PivotCategory); + if (index != -1) + { + AllColumns[index].CountCorrect += columnState.CountCorrect; + AllColumns[index].CountMissing += columnState.CountMissing; + AllColumns[index].CountWrong += columnState.CountWrong; + AllColumns[index].CountInvalidatesRow += columnState.CountInvalidatesRow; + AllColumns[index].CountDBNull += columnState.CountDBNull; + } + } + } + foreach (var column in AllColumns) + { + column.Commit(evaluation, "ALL", con.Connection, con.Transaction); + } + + //* Periodicity States *// + + //Unchanged + Dictionary newByPivotCategoryCubesOverTime = [];//reset + + var unchangedPivotCategories = previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory)).Select(rs => rs.PivotCategory).Distinct(); + newByPivotCategoryCubesOverTime.TryGetValue("ALL", out var value); + if (value is null) + { + newByPivotCategoryCubesOverTime["ALL"] = new PeriodicityCubesOverTime("ALL"); + } + foreach (var pivotCategory in unchangedPivotCategories) + { + var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, pivotCategory, false); + newByPivotCategoryCubesOverTime.TryGetValue(pivotCategory, out value); + if (value is null) + { + newByPivotCategoryCubesOverTime[pivotCategory] = new PeriodicityCubesOverTime(pivotCategory); + } + + foreach (var row in previousPeriodicity.AsEnumerable()) + { + var countOfRecords = int.Parse(row[2].ToString()); + for (var i = 0; i < countOfRecords; i++) + { + + Enum.TryParse(row[3].ToString(), out Consequence consequence); + var date = DateTime.Parse(row[1].ToString()); + newByPivotCategoryCubesOverTime[pivotCategory].IncrementHyperCube(date.Year, date.Month, consequence); + + newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(date.Year, date.Month, consequence); + } + } + } + //replacements + if (existingIncomingPivotCategories.Any()) + { + var updatedRowsDataTable = new DataTable(); + var qb = new QueryBuilder(null, ""); + + using (var updateCon = _server.GetConnection()) + { + updateCon.Open(); + qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); + qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, updateCon); + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; + var adapter = _server.GetDataAdapter(cmd); + updatedRowsDataTable.BeginLoadData(); + adapter.Fill(updatedRowsDataTable); + updatedRowsDataTable.EndLoadData(); + updateCon.Close(); + } + var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); + updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var cc = updatedRowsReportBuilder.GetByPivotCategoryCubesOverTime(); + foreach (var category in cc.Keys) + { + var hyperCube = cc[category].GetHyperCube(); + foreach (var year in hyperCube.Keys) + { + var periodicityCubes = hyperCube[year]; + foreach (var month in 
periodicityCubes.Keys) + { + var cube = periodicityCubes[month]; + foreach (var consequence in Enum.GetValues().Cast().ToList()) + { + var state = cube.GetStateForConsequence(consequence); + for (var i = 0; i < state.CountOfRecords; i++) + { + newByPivotCategoryCubesOverTime.TryGetValue(category, out value); + if (value is null) + { + newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + } + newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, consequence); + } + + } + } + + } + } + } + if (newIncomingPivotCategories.Any()) + { + var updatedRowsDataTable = new DataTable(); + var qb = new QueryBuilder(null, ""); + + using (var updateCon = _server.GetConnection()) + { + updateCon.Open(); + qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); + qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', newIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, updateCon); + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; + var adapter = _server.GetDataAdapter(cmd); + updatedRowsDataTable.BeginLoadData(); + adapter.Fill(updatedRowsDataTable); + updatedRowsDataTable.EndLoadData(); + updateCon.Close(); + } + var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); + updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var cc = updatedRowsReportBuilder.GetByPivotCategoryCubesOverTime(); + foreach (var category in cc.Keys) + { + var hyperCube = cc[category].GetHyperCube(); + foreach (var year in hyperCube.Keys) + { + var periodicityCubes = hyperCube[year]; + foreach (var month in periodicityCubes.Keys) + { + var cube = periodicityCubes[month]; + foreach (var consequence in Enum.GetValues(typeof(Consequence)).Cast().ToList()) + { + var state = cube.GetStateForConsequence(consequence); + for (var i = 0; i < state.CountOfRecords; i++) + { + newByPivotCategoryCubesOverTime.TryGetValue(category, out value); + if (value is null) + { + newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + } + newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, consequence); + } + + } + } + + } + + } + } + //add all the new stuff + foreach (var v in newByPivotCategoryCubesOverTime.Values) + { + v.CommitToDatabase(evaluation); + } + dqeRepository.EndTransactedConnection(true); + + } + + forker.OnNotify(this, + new NotifyEventArgs(ProgressEventType.Information, + "CatalogueConstraintReport completed successfully and committed results to DQE server")); + } + catch (Exception e) + { + forker.OnNotify(this, + e is OperationCanceledException + ? 
new NotifyEventArgs(ProgressEventType.Warning, "DQE Execution Cancelled", e) + : new NotifyEventArgs(ProgressEventType.Error, "Fatal Crash", e)); + } + finally + { + toDatabaseLogger.FinalizeTableLoadInfos(); + } } private string _timePeriodicityField; @@ -529,54 +817,4 @@ private void SetupAdditionalValidationRules(ICheckNotifier notifier) } } - private void ProcessRecord(DQERepository dqeRepository, int dataLoadRunIDOfCurrentRecord, DbDataReader r, - PeriodicityCubesOverTime periodicity, DQEStateOverDataLoadRunId states) - { - //make sure all the results dictionaries - states.AddKeyToDictionaries(dataLoadRunIDOfCurrentRecord, _validator, _queryBuilder); - - //ask the validator to validate! - _validator.ValidateVerboseAdditive( - r, //validate the data reader - states.ColumnValidationFailuresByDataLoadRunID[ - dataLoadRunIDOfCurrentRecord], //additively adjust the validation failures dictionary - out var worstConsequence); //and tell us what the worst consequence in the row was - - - //increment the time periodicity hypercube! - if (_timePeriodicityField != null) - { - DateTime? dt; - - try - { - dt = dqeRepository.ObjectToNullableDateTime(r[_timePeriodicityField]); - } - catch (InvalidCastException e) - { - throw new Exception( - $"Found value {r[_timePeriodicityField]} of type {r[_timePeriodicityField].GetType().Name} in your time periodicity field which was not a valid date time, make sure your time periodicity field is a datetime datatype", - e); - } - - if (dt != null) - periodicity.IncrementHyperCube(dt.Value.Year, dt.Value.Month, worstConsequence); - } - - //now we need to update everything we know about all the columns - foreach (var state in states.AllColumnStates[dataLoadRunIDOfCurrentRecord]) - { - //start out by assuming everything is dandy - state.CountCorrect++; - - if (r[state.TargetProperty] == DBNull.Value) - state.CountDBNull++; - } - - //update row level dictionaries - if (worstConsequence == null) - states.RowsPassingValidationByDataLoadRunID[dataLoadRunIDOfCurrentRecord]++; - else - states.WorstConsequencesByDataLoadRunID[dataLoadRunIDOfCurrentRecord][(Consequence)worstConsequence]++; - } } \ No newline at end of file diff --git a/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs b/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs index 7046a403f3..0a701dbce6 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs @@ -50,6 +50,11 @@ RowsPassingValidationByDataLoadRunID = new Dictionary<int, int>(); } + public string GetPivotCategory() + { + return _pivotCategory; + } + public void AddKeyToDictionaries(int dataLoadRunID, Validator validator, QueryBuilder queryBuilder) { //ensure keys exist (if it is a novel data load run ID then we will add it to the dictionaries diff --git a/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs b/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs index 730234ed2b..e22476cc58 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs @@ -31,6 +31,16 @@ public PeriodicityCubesOverTime(string pivotCategory) _pivotCategory = pivotCategory; } + public Dictionary<int, Dictionary<int, PeriodicityCube>> GetHyperCube() + { + return hyperCube; + } + + public string GetPivotCategory() + { + return _pivotCategory; + } + + public static void PeriodicityCube() { } } diff --git a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs new file mode 100644 index 0000000000..d63016b12a --- /dev/null +++ b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs @@ -0,0 +1,229 @@ +// Copyright (c) The University of Dundee 2024-2024 +// This file is part of the Research Data Management Platform (RDMP). +// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>. +using Rdmp.Core.Curation.Data; +using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; +using Rdmp.Core.QueryBuilding; +using Rdmp.Core.Repositories; +using Rdmp.Core.ReusableLibraryCode.Progress; +using Rdmp.Core.Validation; +using Rdmp.Core.Validation.Constraints; +using System; +using System.Collections.Generic; +using System.Data; +using System.Data.Common; +using System.Diagnostics; +using System.Linq; +using System.Threading; + +namespace Rdmp.Core.DataQualityEngine.Reports; + +/// <summary> +/// Class used to build catalogue constraint reports +/// </summary> +public class ReportBuilder +{ + private readonly string _dataLoadRunFieldName; + + //where the data is located + private readonly QueryBuilder _queryBuilder; + private readonly Validator _validator; + private readonly bool _containsDataLoadID; + + private static readonly int MaximumPivotValues = 5000; + + private readonly Dictionary<string, DQEStateOverDataLoadRunId> byPivotRowStatesOverDataLoadRunId = []; + private readonly Dictionary<string, PeriodicityCubesOverTime> byPivotCategoryCubesOverTime = []; + + private readonly string _timePeriodicityField; + private readonly string _pivotCategory; + private readonly ICatalogue _catalogue; + private bool _haveComplainedAboutNullCategories; + private bool _haveComplainedAboutTrailingWhitespaces; + + private readonly DataTable _resultsDT = new(); + public ReportBuilder(ICatalogue catalogue, Validator validator, QueryBuilder queryBuilder, string dataLoadRunFieldName, bool containsDataLoadID, string timePeriodicityField, string pivotCategory, DbDataReader results) + { + _catalogue = catalogue; + _validator = validator; + _queryBuilder = queryBuilder; + _dataLoadRunFieldName = dataLoadRunFieldName; + _containsDataLoadID = containsDataLoadID; + _timePeriodicityField = timePeriodicityField; + _pivotCategory = pivotCategory; + _resultsDT.Load(results); + } + public ReportBuilder(ICatalogue catalogue, Validator validator, QueryBuilder queryBuilder, string dataLoadRunFieldName, bool containsDataLoadID, string timePeriodicityField, string pivotCategory, DataTable results) + { + _catalogue = catalogue; + _validator = validator; + _queryBuilder = queryBuilder; + _dataLoadRunFieldName = dataLoadRunFieldName; + _containsDataLoadID = containsDataLoadID; + _timePeriodicityField = timePeriodicityField; + _pivotCategory = pivotCategory; + _resultsDT = results; + } + + public Dictionary<string, DQEStateOverDataLoadRunId> GetByPivotRowStatesOverDataLoadRunId() => byPivotRowStatesOverDataLoadRunId; + public Dictionary<string, PeriodicityCubesOverTime> GetByPivotCategoryCubesOverTime() => byPivotCategoryCubesOverTime; + + public void BuildReportInternals( + CancellationToken cancellationToken,
+    public void BuildReportInternals(
+        CancellationToken cancellationToken,
+        ForkDataLoadEventListener forker,
+        DQERepository dqeRepository)
+    {
+        byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL"));
+        byPivotRowStatesOverDataLoadRunId.Add("ALL", new DQEStateOverDataLoadRunId("ALL"));
+
+        var sw = Stopwatch.StartNew();
+        var progress = 0;
+
+        foreach (var r in _resultsDT.AsEnumerable())
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            progress++;
+
+            //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available)
+            var dataLoadRunIDOfCurrentRecord = 0;
+
+            //if the results are likely to have a data load run ID column
+            if (_containsDataLoadID)
+            {
+                //get data load run id
+                var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]);
+
+                //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue)
+                if (runID != null)
+                    dataLoadRunIDOfCurrentRecord = (int)runID;
+            }
+
+            string pivotValue = null;
+
+            //if the user has a pivot category configured
+            if (_pivotCategory != null)
+            {
+                pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker);
+
+                if (!_haveComplainedAboutNullCategories && string.IsNullOrWhiteSpace(pivotValue))
+                {
+                    forker.OnNotify(this,
+                        new NotifyEventArgs(ProgressEventType.Warning,
+                            $"Found a null/empty value for pivot category '{_pivotCategory}'. This record will ONLY be recorded under ALL and not its specific category; you will not be warned of further nulls because there are likely to be many if there are any"));
+                    _haveComplainedAboutNullCategories = true;
+                    pivotValue = null;
+                }
+            }
+
+            //always increase the "ALL" category
+            ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r,
+                byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]);
+
+            //if there is a value in the current record for the pivot column
+            if (pivotValue != null)
+            {
+                //if it is a novel pivot value
+                if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime))
+                {
+                    //we will need to expand the dictionaries
+                    if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues)
+                        throw new OverflowException(
+                            $"Encountered more than {MaximumPivotValues} values for the pivot column {_pivotCategory}; this will result in crazy space usage since it is a multiplicative scale of DQE tesseracts");
+
+                    //expand both the time periodicity and the state results
+                    byPivotRowStatesOverDataLoadRunId.Add(pivotValue,
+                        new DQEStateOverDataLoadRunId(pivotValue));
+                    periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue);
+                    byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime);
+                }
+
+                //now we are sure that the dictionaries have the category field we can increment it
+                ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r,
+                    periodicityCubesOverTime, byPivotRowStatesOverDataLoadRunId[pivotValue]);
+            }
+
+            if (progress % 5000 == 0)
+                forker.OnProgress(this,
+                    new ProgressEventArgs($"Processing {_catalogue}",
+                        new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed));
+        }
+
+        //final value
+        forker.OnProgress(this,
+            new ProgressEventArgs($"Processing {_catalogue}",
+                new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed));
+
+        sw.Stop();
+        foreach (var state in byPivotRowStatesOverDataLoadRunId.Values)
+            state.CalculateFinalValues();
+    }
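`BuildReportInternals` checks the `CancellationToken` on every row and reports progress through the forked listener every 5000 records; each row is always tallied under the synthetic "ALL" pivot bucket, and rows with a usable pivot value are additionally tallied under their own bucket, capped at `MaximumPivotValues`. A hedged driving sketch, reusing the `builder` from the previous example and assuming a `dqeRepository` is available:

```csharp
// Cancellation surfaces as an OperationCanceledException out of the row loop.
var cts = new CancellationTokenSource();

// Fork progress events to an in-memory listener and a throwing one.
var forker = new ForkDataLoadEventListener(
    new ToMemoryDataLoadEventListener(false),
    ThrowImmediatelyDataLoadEventListener.Quiet);

builder.BuildReportInternals(cts.Token, forker, dqeRepository);
```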
+
+    private string GetStringValueForPivotField(object o, IDataLoadEventListener listener)
+    {
+        if (o == null || o == DBNull.Value)
+            return null;
+
+        var stringValue = o.ToString();
+        var trimmedValue = stringValue.Trim();
+
+        if (!_haveComplainedAboutTrailingWhitespaces && stringValue != trimmedValue)
+        {
+            listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
+                $"Found trailing/leading whitespace in value in Pivot field, this will be trimmed off: '{o}'"));
+            _haveComplainedAboutTrailingWhitespaces = true;
+        }
+
+        return trimmedValue;
+    }
+
+    private void ProcessRecord(DQERepository dqeRepository, int dataLoadRunIDOfCurrentRecord, DataRow r,
+        PeriodicityCubesOverTime periodicity, DQEStateOverDataLoadRunId states)
+    {
+        //make sure all the results dictionaries have an entry for this data load run ID
+        states.AddKeyToDictionaries(dataLoadRunIDOfCurrentRecord, _validator, _queryBuilder);
+
+        //ask the validator to validate!
+        _validator.ValidateVerboseAdditive(
+            r, //validate the row
+            states.ColumnValidationFailuresByDataLoadRunID[
+                dataLoadRunIDOfCurrentRecord], //additively adjust the validation failures dictionary
+            out var worstConsequence); //and tell us what the worst consequence in the row was
+
+        //increment the time periodicity hypercube!
+        if (_timePeriodicityField != null)
+        {
+            DateTime? dt;
+
+            try
+            {
+                dt = dqeRepository.ObjectToNullableDateTime(r[_timePeriodicityField]);
+            }
+            catch (InvalidCastException e)
+            {
+                throw new Exception(
+                    $"Found value {r[_timePeriodicityField]} of type {r[_timePeriodicityField].GetType().Name} in your time periodicity field which was not a valid date time; make sure your time periodicity field is a datetime datatype",
+                    e);
+            }
+
+            if (dt != null)
+                periodicity.IncrementHyperCube(dt.Value.Year, dt.Value.Month, worstConsequence);
+        }
+
+        //now we need to update everything we know about all the columns
+        foreach (var state in states.AllColumnStates[dataLoadRunIDOfCurrentRecord])
+        {
+            //start out by assuming everything is dandy
+            state.CountCorrect++;
+
+            if (r[state.TargetProperty] == DBNull.Value)
+                state.CountDBNull++;
+        }
+
+        //update row level dictionaries
+        if (worstConsequence == null)
+            states.RowsPassingValidationByDataLoadRunID[dataLoadRunIDOfCurrentRecord]++;
+        else
+            states.WorstConsequencesByDataLoadRunID[dataLoadRunIDOfCurrentRecord][(Consequence)worstConsequence]++;
+    }
+}
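Once the loop has finished and `CalculateFinalValues` has run, the accumulated state can be read back through the accessor methods. A hypothetical NUnit-flavoured check, continuing the two illustrative rows from the earlier sketches and assuming both pass validation under data load run 10:

```csharp
// Both rows belong to run 10 and (by assumption) pass validation.
var states = builder.GetByPivotRowStatesOverDataLoadRunId()["ALL"];
Assert.That(states.RowsPassingValidationByDataLoadRunID[10], Is.EqualTo(2));

// Their eventDate values (2024-12-01 and 2024-11-01) should occupy the 2024
// slice of the periodicity hypercube, under months 12 and 11 respectively.
var cube = builder.GetByPivotCategoryCubesOverTime()["ALL"].GetHyperCube();
Assert.That(cube[2024], Does.ContainKey(12));
Assert.That(cube[2024], Does.ContainKey(11));

// Per-category buckets exist for "A" and "B" as well as the synthetic "ALL".
Assert.That(builder.GetByPivotCategoryCubesOverTime().Keys,
    Is.EquivalentTo(new[] { "ALL", "A", "B" }));
```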