From 2a516bf15bc863ffbf3b5f92b0d6952b068e9911 Mon Sep 17 00:00:00 2001 From: James Friel Date: Wed, 25 Sep 2024 14:05:03 +0100 Subject: [PATCH 01/35] add changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c4ac89757..172ecdb1dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [8.4.0] - Unreleased + ## [8.3.1] - Unreleased - Improve Performance of regenerating problems with child providers From 53596b0457088e55eac543d8f70b5fe4a5849e1a Mon Sep 17 00:00:00 2001 From: James Friel Date: Thu, 26 Sep 2024 10:21:29 +0100 Subject: [PATCH 02/35] Bugfix/rdmp 253 filter ordering (#2007) * add start * partial * tidy up code * filter updates * update db migrations * update correct db * update changelog * add base creation sql * fix create * tidy up --- CHANGELOG.md | 3 + .../Data/Aggregation/AggregateFilter.cs | 6 +- Rdmp.Core/Curation/Data/ConcreteFilter.cs | 5 +- Rdmp.Core/Curation/Data/ExtractionFilter.cs | 6 +- .../SpontaneouslyInventedFilter.cs | 5 +- .../Data/DeployedExtractionFilter.cs | 6 +- .../CreateCatalogue.sql | 2 + .../up/087_AddAggregateFilterOrdering.sql | 13 ++++ .../CreateDataExportManager.sql | Bin 207702 -> 207772 bytes .../up/026_AddFilterOrder.sql | 8 +++ Rdmp.Core/Rdmp.Core.csproj | 4 ++ .../ExecuteCommandReorderFilter.cs | 62 ++++++++++++++++++ ...poseExecutionWhenTargetIsConcreteFilter.cs | 18 +++-- SharedAssemblyInfo.cs | 6 +- 14 files changed, 131 insertions(+), 13 deletions(-) create mode 100644 Rdmp.Core/Databases/CatalogueDatabase/up/087_AddAggregateFilterOrdering.sql create mode 100644 Rdmp.Core/Databases/DataExportDatabase/up/026_AddFilterOrder.sql create mode 100644 Rdmp.UI/CommandExecution/AtomicCommands/ExecuteCommandReorderFilter.cs 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 172ecdb1dd..3bd8a30350 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ + # Changelog All notable changes to this project will be documented in this file. @@ -7,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [8.4.0] - Unreleased +- Add Ordering to Filters + ## [8.3.1] - Unreleased - Improve Performance of regenerating problems with child providers diff --git a/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs b/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs index 7b23404346..307a7ef8aa 100644 --- a/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs +++ b/Rdmp.Core/Curation/Data/Aggregation/AggregateFilter.cs @@ -1,4 +1,4 @@ -// Copyright (c) The University of Dundee 2018-2019 +// Copyright (c) The University of Dundee 2018-2024 // This file is part of the Research Data Management Platform (RDMP). // RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. @@ -35,6 +35,7 @@ public class AggregateFilter : ConcreteFilter, IDisableable private int? _clonedFromExtractionFilterID; private int? _associatedColumnInfoID; private bool _isDisabled; + private int _order; /// public override int? ClonedFromExtractionFilter_ID @@ -90,6 +91,8 @@ public IEnumerable AggregateFilterParameters ? 
Repository.GetObjectByID(FilterContainer_ID.Value) : null; + public override int Order { get => _order; set => SetField(ref _order, value); } + #endregion public AggregateFilter() @@ -121,6 +124,7 @@ internal AggregateFilter(ICatalogueRepository repository, DbDataReader r) : base Name = r["Name"] as string; IsMandatory = (bool)r["IsMandatory"]; ClonedFromExtractionFilter_ID = ObjectToNullableInt(r["ClonedFromExtractionFilter_ID"]); + Order = int.Parse(r["Order"].ToString()); var associatedColumnInfo_ID = r["AssociatedColumnInfo_ID"]; if (associatedColumnInfo_ID != DBNull.Value) diff --git a/Rdmp.Core/Curation/Data/ConcreteFilter.cs b/Rdmp.Core/Curation/Data/ConcreteFilter.cs index c29ebbdeca..b8fe1f49de 100644 --- a/Rdmp.Core/Curation/Data/ConcreteFilter.cs +++ b/Rdmp.Core/Curation/Data/ConcreteFilter.cs @@ -1,4 +1,4 @@ -// Copyright (c) The University of Dundee 2018-2019 +// Copyright (c) The University of Dundee 2018-2024 // This file is part of the Research Data Management Platform (RDMP). // RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
@@ -25,7 +25,7 @@ namespace Rdmp.Core.Curation.Data; /// ConcreteFilter is used to provide UI editing of an IFilter without having to add persistence / DatabaseEntity logic to IFilter (which would break /// SpontaneouslyInventedFilters) /// -public abstract class ConcreteFilter : DatabaseEntity, IFilter, ICheckable +public abstract class ConcreteFilter : DatabaseEntity, IFilter, ICheckable, IOrderable { /// protected ConcreteFilter(IRepository repository, DbDataReader r) : base(repository, r) @@ -100,6 +100,7 @@ public bool IsMandatory /// [NoMappingToDatabase] public abstract IContainer FilterContainer { get; } + public abstract int Order { get; set; } #endregion diff --git a/Rdmp.Core/Curation/Data/ExtractionFilter.cs b/Rdmp.Core/Curation/Data/ExtractionFilter.cs index f6ce6f8d4c..febd12b629 100644 --- a/Rdmp.Core/Curation/Data/ExtractionFilter.cs +++ b/Rdmp.Core/Curation/Data/ExtractionFilter.cs @@ -1,4 +1,4 @@ -// Copyright (c) The University of Dundee 2018-2019 +// Copyright (c) The University of Dundee 2018-2024 // This file is part of the Research Data Management Platform (RDMP). // RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. @@ -40,6 +40,7 @@ public class ExtractionFilter : ConcreteFilter, IHasDependencies, IInjectKnown _knownExtractionFilterParameterSets; + private int _order; /// /// The column in the which is best/most associated with this filter. 
A filter can query any column in any of the table(s) under @@ -133,6 +134,8 @@ internal ExtractionFilter(ICatalogueRepository repository, DbDataReader r) Description = r["Description"] as string; Name = r["Name"] as string; IsMandatory = (bool)r["IsMandatory"]; + Order = int.Parse(r["Order"].ToString()); + ClearAllInjections(); } @@ -154,6 +157,7 @@ public override int? ClonedFromExtractionFilter_ID set => throw new NotSupportedException( "ClonedFromExtractionFilter_ID is only supported on lower level filters e.g. DeployedExtractionFilter and AggregateFilter"); } + public override int Order { get => _order; set => SetField(ref _order,value); } /// public IHasDependencies[] GetObjectsThisDependsOn() diff --git a/Rdmp.Core/Curation/Data/Spontaneous/SpontaneouslyInventedFilter.cs b/Rdmp.Core/Curation/Data/Spontaneous/SpontaneouslyInventedFilter.cs index 8bbadc4f85..dd8458426f 100644 --- a/Rdmp.Core/Curation/Data/Spontaneous/SpontaneouslyInventedFilter.cs +++ b/Rdmp.Core/Curation/Data/Spontaneous/SpontaneouslyInventedFilter.cs @@ -1,4 +1,4 @@ -// Copyright (c) The University of Dundee 2018-2019 +// Copyright (c) The University of Dundee 2018-2024 // This file is part of the Research Data Management Platform (RDMP). // RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
@@ -22,6 +22,7 @@ public class SpontaneouslyInventedFilter : ConcreteFilter { private readonly MemoryCatalogueRepository _repo; private readonly ISqlParameter[] _filterParametersIfAny; + private int _order =0; /// /// Creates a new temporary (unsaveable) filter in the given memory @@ -68,6 +69,8 @@ public SpontaneouslyInventedFilter(MemoryCatalogueRepository repo, IFilter copyF ? _repo.GetObjectByID(FilterContainer_ID.Value) : null; + public override int Order { get => _order; set => SetField(ref _order, value); } + public override ColumnInfo GetColumnInfoIfExists() => null; public override IFilterFactory GetFilterFactory() => null; diff --git a/Rdmp.Core/DataExport/Data/DeployedExtractionFilter.cs b/Rdmp.Core/DataExport/Data/DeployedExtractionFilter.cs index d1e7960ade..122eece307 100644 --- a/Rdmp.Core/DataExport/Data/DeployedExtractionFilter.cs +++ b/Rdmp.Core/DataExport/Data/DeployedExtractionFilter.cs @@ -1,4 +1,4 @@ -// Copyright (c) The University of Dundee 2018-2019 +// Copyright (c) The University of Dundee 2018-2024 // This file is part of the Research Data Management Platform (RDMP). // RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. @@ -36,6 +36,7 @@ public class DeployedExtractionFilter : ConcreteFilter private int? _clonedFromExtractionFilterID; private int? _filterContainerID; + private int _order; /// public override int? ClonedFromExtractionFilter_ID @@ -69,6 +70,8 @@ public override int? FilterContainer_ID ? 
Repository.GetObjectByID(FilterContainer_ID.Value) : null; + public override int Order { get => _order; set => SetField(ref _order, value); } + #endregion /// @@ -138,6 +141,7 @@ internal DeployedExtractionFilter(IDataExportRepository repository, DbDataReader FilterContainer_ID = null; ClonedFromExtractionFilter_ID = ObjectToNullableInt(r["ClonedFromExtractionFilter_ID"]); + Order = int.Parse(r["Order"].ToString()); } /// diff --git a/Rdmp.Core/Databases/CatalogueDatabase/runAfterCreateDatabase/CreateCatalogue.sql b/Rdmp.Core/Databases/CatalogueDatabase/runAfterCreateDatabase/CreateCatalogue.sql index e794b1e35c..418fcf7e9d 100644 --- a/Rdmp.Core/Databases/CatalogueDatabase/runAfterCreateDatabase/CreateCatalogue.sql +++ b/Rdmp.Core/Databases/CatalogueDatabase/runAfterCreateDatabase/CreateCatalogue.sql @@ -141,6 +141,7 @@ CREATE TABLE [dbo].[AggregateFilter]( [AssociatedColumnInfo_ID] [int] NULL, [ID] [int] IDENTITY(1,1) NOT NULL, [SoftwareVersion] [nvarchar](50) NOT NULL, + [Order] [int] NOT NULL DEFAULT 0 CONSTRAINT [PK_AggregateFilter] PRIMARY KEY CLUSTERED ( [ID] ASC @@ -464,6 +465,7 @@ CREATE TABLE [dbo].[ExtractionFilter]( [Name] [varchar](100) NOT NULL, [IsMandatory] [bit] NOT NULL, [SoftwareVersion] [nvarchar](50) NOT NULL, + [Order] [int] NOT NULL DEFAULT 0 CONSTRAINT [PK_ExtractionFilter] PRIMARY KEY CLUSTERED ( [ID] ASC diff --git a/Rdmp.Core/Databases/CatalogueDatabase/up/087_AddAggregateFilterOrdering.sql b/Rdmp.Core/Databases/CatalogueDatabase/up/087_AddAggregateFilterOrdering.sql new file mode 100644 index 0000000000..0d3e473416 --- /dev/null +++ b/Rdmp.Core/Databases/CatalogueDatabase/up/087_AddAggregateFilterOrdering.sql @@ -0,0 +1,13 @@ +----Version: 8.4.0 +----Description: Add Order to Aggregate Filters + +if not exists (select 1 from sys.columns where name = 'Order' and OBJECT_NAME(object_id) = 'AggregateFilter') +BEGIN +ALTER TABLE [dbo].[AggregateFilter] +ADD [Order] [int] NOT NULL DEFAULT 0 WITH VALUES +END +if not exists (select 1 from 
sys.columns where name = 'Order' and OBJECT_NAME(object_id) = 'ExtractionFilter') +BEGIN +ALTER TABLE [dbo].[ExtractionFilter] +ADD [Order] [int] NOT NULL DEFAULT 0 WITH VALUES +END \ No newline at end of file diff --git a/Rdmp.Core/Databases/DataExportDatabase/runAfterCreateDatabase/CreateDataExportManager.sql b/Rdmp.Core/Databases/DataExportDatabase/runAfterCreateDatabase/CreateDataExportManager.sql index 15f75c2cc478241235068c0ad4aa43f49a778571..ac1c03a92acf841ff12f997ebad6538dc418bdb8 100644 GIT binary patch delta 56 zcmcb1jc3kvo(&l?lM7tgCND6Sn%uz8H#sO-h|yp&BdhAV!Z delta 30 lcmbPpo#)y$o(&l?lM})fn$O9!pOaw(Vy5lqWSGzK0RYrq45$DA diff --git a/Rdmp.Core/Databases/DataExportDatabase/up/026_AddFilterOrder.sql b/Rdmp.Core/Databases/DataExportDatabase/up/026_AddFilterOrder.sql new file mode 100644 index 0000000000..1181d65c9b --- /dev/null +++ b/Rdmp.Core/Databases/DataExportDatabase/up/026_AddFilterOrder.sql @@ -0,0 +1,8 @@ +----Version: 8.4.0 +----Description: Add Order to Aggregate Filters + +if not exists (select 1 from sys.columns where name = 'Order' and OBJECT_NAME(object_id) = 'DeployedExtractionFilter') +BEGIN +ALTER TABLE [dbo].[DeployedExtractionFilter] +ADD [Order] [int] NOT NULL DEFAULT 0 WITH VALUES +END \ No newline at end of file diff --git a/Rdmp.Core/Rdmp.Core.csproj b/Rdmp.Core/Rdmp.Core.csproj index 449cc6abdb..3f2209d40d 100644 --- a/Rdmp.Core/Rdmp.Core.csproj +++ b/Rdmp.Core/Rdmp.Core.csproj @@ -129,6 +129,7 @@ + @@ -157,6 +158,7 @@ + @@ -255,6 +257,7 @@ + @@ -297,6 +300,7 @@ + diff --git a/Rdmp.UI/CommandExecution/AtomicCommands/ExecuteCommandReorderFilter.cs b/Rdmp.UI/CommandExecution/AtomicCommands/ExecuteCommandReorderFilter.cs new file mode 100644 index 0000000000..e6483fa5a1 --- /dev/null +++ b/Rdmp.UI/CommandExecution/AtomicCommands/ExecuteCommandReorderFilter.cs @@ -0,0 +1,62 @@ +// Copyright (c) The University of Dundee 2024-2024 +// This file is part of the Research Data Management Platform (RDMP). 
+// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// You should have received a copy of the GNU General Public License along with RDMP. If not, see . + +using Rdmp.Core.Curation.Data; +using Rdmp.Core.MapsDirectlyToDatabaseTable; +using Rdmp.UI.ItemActivation; +using System; +using System.Linq; + +namespace Rdmp.UI.CommandExecution.AtomicCommands; + +public class ExecuteCommandReorderFilter : BasicUICommandExecution +{ + private ConcreteFilter _source; + private ConcreteFilter _target; + private InsertOption _insertOption; + + public ExecuteCommandReorderFilter(IActivateItems activator, ConcreteFilter source, ConcreteFilter destination, InsertOption insertOption) : base(activator) + { + _source = source; + _target = destination; + _insertOption = insertOption; + if (_source.FilterContainer_ID is null || _target.FilterContainer_ID is null) + { + SetImpossible("Both filters must exist within some container in order to be orderable"); + } + if (_source.FilterContainer_ID != _target.FilterContainer_ID) + { + SetImpossible("Cannot reorder filters as they do not share a parent"); + } + } + + public override void Execute() + { + var order = _target.Order; + + var filters = _target.FilterContainer.GetFilters().Where(f => f is ConcreteFilter).Select(f => (ConcreteFilter)f).ToArray(); + Array.Sort( + filters, + delegate (ConcreteFilter a, ConcreteFilter b) { return a.Order.CompareTo(b.Order); } + ); + if (!filters.All(c => c.Order != order)) + { + foreach (var orderable in filters) + { + if (orderable.Order < order) + orderable.Order--; + else if (orderable.Order > 
order) + orderable.Order++; + else //collision on order + orderable.Order += _insertOption == InsertOption.InsertAbove ? 1 : -1; + ((ISaveable)orderable).SaveToDatabase(); + } + } + _source.Order = order; + _source.SaveToDatabase(); + Publish(_target.FilterContainer); + } +} diff --git a/Rdmp.UI/CommandExecution/Proposals/ProposeExecutionWhenTargetIsConcreteFilter.cs b/Rdmp.UI/CommandExecution/Proposals/ProposeExecutionWhenTargetIsConcreteFilter.cs index 93f6fb3c1c..98d290b1b0 100644 --- a/Rdmp.UI/CommandExecution/Proposals/ProposeExecutionWhenTargetIsConcreteFilter.cs +++ b/Rdmp.UI/CommandExecution/Proposals/ProposeExecutionWhenTargetIsConcreteFilter.cs @@ -5,7 +5,9 @@ // You should have received a copy of the GNU General Public License along with RDMP. If not, see . using Rdmp.Core.CommandExecution; +using Rdmp.Core.CommandExecution.Combining; using Rdmp.Core.Curation.Data; +using Rdmp.UI.CommandExecution.AtomicCommands; using Rdmp.UI.ExtractionUIs.FilterUIs; using Rdmp.UI.ItemActivation; @@ -24,8 +26,16 @@ public override void Activate(ConcreteFilter target) ItemActivator.Activate(target); } - public override ICommandExecution ProposeExecution(ICombineToMakeCommand cmd, ConcreteFilter target, - InsertOption insertOption = InsertOption.Default) => - //currently nothing can be dropped onto a filter - null; + public override ICommandExecution ProposeExecution(ICombineToMakeCommand cmd, ConcreteFilter targetFilter, + InsertOption insertOption = InsertOption.Default) + { + return cmd switch + { + FilterCombineable sourceFilterCommand => + !sourceFilterCommand.Filter.Equals(targetFilter) && sourceFilterCommand.Filter is ConcreteFilter && sourceFilterCommand.Filter.FilterContainer_ID == targetFilter.FilterContainer_ID ? 
+ new ExecuteCommandReorderFilter(ItemActivator, (ConcreteFilter)sourceFilterCommand.Filter, targetFilter, insertOption) + : null, + _ => null + }; + } } \ No newline at end of file diff --git a/SharedAssemblyInfo.cs b/SharedAssemblyInfo.cs index 559a541653..dd904c2b97 100644 --- a/SharedAssemblyInfo.cs +++ b/SharedAssemblyInfo.cs @@ -10,6 +10,6 @@ [assembly: AssemblyTrademark("")] [assembly: AssemblyCulture("")] -[assembly: AssemblyVersion("8.3.0")] -[assembly: AssemblyFileVersion("8.3.0")] -[assembly: AssemblyInformationalVersion("8.3.0")] \ No newline at end of file +[assembly: AssemblyVersion("8.4.0")] +[assembly: AssemblyFileVersion("8.4.0")] +[assembly: AssemblyInformationalVersion("8.4.0")] \ No newline at end of file From 670a5f6d966c17af96cbba54c32b0d2a3c9e7e23 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 4 Nov 2024 14:37:57 +0000 Subject: [PATCH 03/35] interim --- Rdmp.Core/CommandLine/Options/DqeOptions.cs | 3 + Rdmp.Core/CommandLine/Runners/DqeRunner.cs | 10 +- .../Modules/Mutilators/DQEPostLoadRunner.cs | 1 + .../DataQualityEngine/Data/ColumnState.cs | 3 +- .../Reports/CatalogueConstraintReport.cs | 279 ++++++++++++------ .../Reports/DQEStateOverDataLoadRunId.cs | 5 + .../PeriodicityCubesOverTime.cs | 5 + 7 files changed, 215 insertions(+), 91 deletions(-) diff --git a/Rdmp.Core/CommandLine/Options/DqeOptions.cs b/Rdmp.Core/CommandLine/Options/DqeOptions.cs index 8b37f68a1b..b3e89a4425 100644 --- a/Rdmp.Core/CommandLine/Options/DqeOptions.cs +++ b/Rdmp.Core/CommandLine/Options/DqeOptions.cs @@ -16,4 +16,7 @@ public class DqeOptions : RDMPCommandLineOptions { [Option('c', "Catalogue", HelpText = "ID of the Catalogue to run the DQE on", Required = true)] public string Catalogue { get; set; } + + [Option('d', "DataLoad", HelpText = "ID of the Data Load to run the DQE on. 
Adds new data to existing DQE results if they exist", Required = false)] + public string DataLoadUpdateID { get; set; } } \ No newline at end of file diff --git a/Rdmp.Core/CommandLine/Runners/DqeRunner.cs b/Rdmp.Core/CommandLine/Runners/DqeRunner.cs index 2746c269ee..76aeeaba05 100644 --- a/Rdmp.Core/CommandLine/Runners/DqeRunner.cs +++ b/Rdmp.Core/CommandLine/Runners/DqeRunner.cs @@ -5,6 +5,7 @@ // You should have received a copy of the GNU General Public License along with RDMP. If not, see . using System; +using Org.BouncyCastle.Security.Certificates; using Rdmp.Core.CommandLine.Options; using Rdmp.Core.Curation.Data; using Rdmp.Core.DataFlowPipeline; @@ -29,12 +30,19 @@ public override int Run(IRDMPPlatformRepositoryServiceLocator repositoryLocator, ICheckNotifier checkNotifier, GracefulCancellationToken token) { var catalogue = GetObjectFromCommandLineString(repositoryLocator, _options.Catalogue); + int? dataLoadID = null; + if (_options.DataLoadUpdateID != null) + dataLoadID = int.Parse(_options.DataLoadUpdateID); + var report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID); switch (_options.Command) { case CommandLineActivity.run: - report.GenerateReport(catalogue, listener, token.AbortToken); + if (dataLoadID is not null) + report.UpdateReport(catalogue, (int)dataLoadID, listener, token.AbortToken); + else + report.GenerateReport(catalogue, listener, token.AbortToken); return 0; case CommandLineActivity.check: diff --git a/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs b/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs index 5d1f664ce2..4e4b79f0fd 100644 --- a/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs +++ b/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs @@ -73,6 +73,7 @@ public ExitCodeType Mutilate(IDataLoadJob job) DqeOptions options = new() { Catalogue = catalogue.ID.ToString(), + DataLoadUpdateID = job.DataLoadInfo.ID.ToString(), Command = CommandLineActivity.run }; var 
runner = RunnerFactory.CreateRunner(new ThrowImmediatelyActivator(job.RepositoryLocator), options); diff --git a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs index c3b7003975..7099c1a74b 100644 --- a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs @@ -133,7 +133,8 @@ protected ColumnState() public void Commit(Evaluation evaluation, string pivotCategory, DbConnection con, DbTransaction transaction) { if (IsCommitted) - throw new NotSupportedException("ColumnState was already committed"); + return; + //throw new NotSupportedException("ColumnState was already committed"); var sql = $"INSERT INTO ColumnState(TargetProperty,DataLoadRunID,Evaluation_ID,CountCorrect,CountDBNull,ItemValidatorXML,CountMissing,CountWrong,CountInvalidatesRow,PivotCategory)VALUES({"@TargetProperty"},{DataLoadRunID},{evaluation.ID},{CountCorrect},{CountDBNull},@ItemValidatorXML,{CountMissing},{CountWrong},{CountInvalidatesRow},@PivotCategory)"; diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index d7df51dc20..e8fa703e86 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -13,6 +13,7 @@ using FAnsi.Discovery; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Defaults; +using Rdmp.Core.DataLoad.Triggers; using Rdmp.Core.DataQualityEngine.Data; using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; using Rdmp.Core.Logging; @@ -51,6 +52,8 @@ public class CatalogueConstraintReport : DataQualityReport private string _loggingTask; private LogManager _logManager; + private int? 
_dataLoadID; + /// /// Set this property to use an explicit DQE results store database instead of the /// default DQE database indicated by the @@ -91,124 +94,132 @@ private void SetupLogging(ICatalogueRepository repository) private bool haveComplainedAboutNullCategories; - public override void GenerateReport(ICatalogue c, IDataLoadEventListener listener, - CancellationToken cancellationToken) - { - SetupLogging(c.CatalogueRepository); - var toDatabaseLogger = new ToLoggingDatabaseDataLoadEventListener(this, _logManager, _loggingTask, - $"DQE evaluation of {c}"); + private void BuildReportInternals(ICatalogue c, IDataLoadEventListener listener, + CancellationToken cancellationToken, ForkDataLoadEventListener forker, DQERepository dqeRepository) + { + byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL")); + byPivotRowStatesOverDataLoadRunId.Add("ALL", new DQEStateOverDataLoadRunId("ALL")); - var forker = new ForkDataLoadEventListener(listener, toDatabaseLogger); + Check(new FromDataLoadEventListenerToCheckNotifier(forker)); - try + var sw = Stopwatch.StartNew(); + using (var con = _server.GetConnection()) { - _catalogue = c; - var dqeRepository = ExplicitDQERepository ?? 
new DQERepository(c.CatalogueRepository); + con.Open(); + var qb = _queryBuilder; + if (_dataLoadID is not null) + qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, con); + cmd.CommandTimeout = 500000; - byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL")); - byPivotRowStatesOverDataLoadRunId.Add("ALL", new DQEStateOverDataLoadRunId("ALL")); + var t = cmd.ExecuteReaderAsync(cancellationToken); + t.Wait(cancellationToken); - Check(new FromDataLoadEventListenerToCheckNotifier(forker)); + if (cancellationToken.IsCancellationRequested) + throw new OperationCanceledException("User cancelled DQE while fetching data"); - var sw = Stopwatch.StartNew(); - using (var con = _server.GetConnection()) - { - con.Open(); + var r = t.Result; - var cmd = _server.GetCommand(_queryBuilder.SQL, con); - cmd.CommandTimeout = 500000; + var progress = 0; - var t = cmd.ExecuteReaderAsync(cancellationToken); - t.Wait(cancellationToken); + while (r.Read()) + { + cancellationToken.ThrowIfCancellationRequested(); - if (cancellationToken.IsCancellationRequested) - throw new OperationCanceledException("User cancelled DQE while fetching data"); + progress++; + var dataLoadRunIDOfCurrentRecord = 0; + //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available) - var r = t.Result; + //if the DataReader is likely to have a data load run ID column + if (_containsDataLoadID) + { + //get data load run id + var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]); + + //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue) + if (runID != null) + dataLoadRunIDOfCurrentRecord = (int)runID; + } - var progress = 0; + string pivotValue = null; - while (r.Read()) + //if the user has a pivot category configured + if (_pivotCategory != null) { - 
cancellationToken.ThrowIfCancellationRequested(); - - progress++; - var dataLoadRunIDOfCurrentRecord = 0; - //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available) + pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker); - //if the DataReader is likely to have a data load run ID column - if (_containsDataLoadID) + if (!haveComplainedAboutNullCategories && string.IsNullOrWhiteSpace(pivotValue)) { - //get data load run id - var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]); - - //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue) - if (runID != null) - dataLoadRunIDOfCurrentRecord = (int)runID; + forker.OnNotify(this, + new NotifyEventArgs(ProgressEventType.Warning, + $"Found a null/empty value for pivot category '{_pivotCategory}', this record will ONLY be recorded under ALL and not its specific category, you will not be warned of further nulls because there are likely to be many if there are any")); + haveComplainedAboutNullCategories = true; + pivotValue = null; } + } - string pivotValue = null; + //always increase the "ALL" category + ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, + byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]); - //if the user has a pivot category configured - if (_pivotCategory != null) + //if there is a value in the current record for the pivot column + if (pivotValue != null) + { + //if it is a novel + if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime)) { - pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker); - - if (!haveComplainedAboutNullCategories && string.IsNullOrWhiteSpace(pivotValue)) - { - forker.OnNotify(this, - new NotifyEventArgs(ProgressEventType.Warning, - $"Found a null/empty value for pivot category '{_pivotCategory}', this record will ONLY be recorded under ALL and 
not its specific category, you will not be warned of further nulls because there are likely to be many if there are any")); - haveComplainedAboutNullCategories = true; - pivotValue = null; - } + //we will need to expand the dictionaries + if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues) + throw new OverflowException( + $"Encountered more than {MaximumPivotValues} values for the pivot column {_pivotCategory} this will result in crazy space usage since it is a multiplicative scale of DQE tesseracts"); + + //expand both the time periodicity and the state results + byPivotRowStatesOverDataLoadRunId.Add(pivotValue, + new DQEStateOverDataLoadRunId(pivotValue)); + periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue); + byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime); } - //always increase the "ALL" category + //now we are sure that the dictionaries have the category field we can increment it ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, - byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]); - - //if there is a value in the current record for the pivot column - if (pivotValue != null) - { - //if it is a novel - if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime)) - { - //we will need to expand the dictionaries - if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues) - throw new OverflowException( - $"Encountered more than {MaximumPivotValues} values for the pivot column {_pivotCategory} this will result in crazy space usage since it is a multiplicative scale of DQE tesseracts"); - - //expand both the time periodicity and the state results - byPivotRowStatesOverDataLoadRunId.Add(pivotValue, - new DQEStateOverDataLoadRunId(pivotValue)); - periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue); - byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime); - } - - //now we are sure that the dictionaries 
have the category field we can increment it - ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, periodicityCubesOverTime, byPivotRowStatesOverDataLoadRunId[pivotValue]); - } - - if (progress % 5000 == 0) - forker.OnProgress(this, - new ProgressEventArgs($"Processing {_catalogue}", - new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); } - //final value - forker.OnProgress(this, - new ProgressEventArgs($"Processing {_catalogue}", - new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); - con.Close(); + if (progress % 5000 == 0) + forker.OnProgress(this, + new ProgressEventArgs($"Processing {_catalogue}", + new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); } - sw.Stop(); + //final value + forker.OnProgress(this, + new ProgressEventArgs($"Processing {_catalogue}", + new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); + con.Close(); + } + + sw.Stop(); + foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) + state.CalculateFinalValues(); + } + + public override void GenerateReport(ICatalogue c, IDataLoadEventListener listener, + CancellationToken cancellationToken) + { + SetupLogging(c.CatalogueRepository); + + var toDatabaseLogger = new ToLoggingDatabaseDataLoadEventListener(this, _logManager, _loggingTask, + $"DQE evaluation of {c}"); + + var forker = new ForkDataLoadEventListener(listener, toDatabaseLogger); + + try + { + _catalogue = c; + var dqeRepository = ExplicitDQERepository ?? 
new DQERepository(c.CatalogueRepository); + BuildReportInternals(c, listener, cancellationToken, forker, dqeRepository); - foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) - state.CalculateFinalValues(); //now commit results using (var con = dqeRepository.BeginNewTransactedConnection()) @@ -251,6 +262,96 @@ e is OperationCanceledException } } + + public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener listener, + CancellationToken cancellationToken) + { + _dataLoadID = dataLoadID; + SetupLogging(c.CatalogueRepository); + + var toDatabaseLogger = new ToLoggingDatabaseDataLoadEventListener(this, _logManager, _loggingTask, + $"DQE evaluation of {c}"); + + var forker = new ForkDataLoadEventListener(listener, toDatabaseLogger); + try + { + _catalogue = c; + var dqeRepository = ExplicitDQERepository ?? new DQERepository(c.CatalogueRepository); + //make report for new data + BuildReportInternals(c, listener, cancellationToken, forker, dqeRepository); + var newByPivotRowStatesOverDataLoadRunId = byPivotRowStatesOverDataLoadRunId; + var newByPivotCategoryCubesOverTime = byPivotCategoryCubesOverTime; + using (var con = dqeRepository.BeginNewTransactedConnection()) + { + try + { + //mark down that we are beginning an evaluation on this the day of our lord etc... + var previousEvaluation = dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); + var evaluation = new Evaluation(dqeRepository, _catalogue); + foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) + { + //TODO find existing state and update - does this work? 
still to test + //var previousRowStates = previousEvaluation.RowStates.Where(r => r.PivotCategory == state.GetPiviotCategory()); + //foreach (var previousState in previousRowStates) + //{ + // state.RowsPassingValidationByDataLoadRunID[previousState.DataLoadRunID] = previousState.Correct; + // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID] = new Dictionary(); + // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID][Consequence.Missing] = previousState.Missing; + // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID][Consequence.Wrong] = previousState.Wrong; + // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID][Consequence.InvalidatesRow] = previousState.Invalid; + // var previousColumnStates = previousEvaluation.ColumnStates.Where(c => c.DataLoadRunID == previousState.DataLoadRunID); + + // state.AllColumnStates[previousState.DataLoadRunID] = previousColumnStates.ToArray(); + //} + state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); + } + foreach(var rowState in previousEvaluation.RowStates) + { + _ = new RowState(evaluation, rowState.DataLoadRunID,rowState.Correct,rowState.Missing, rowState.Wrong, rowState.Invalid,rowState.ValidatorXML,rowState.PivotCategory,con.Connection,con.Transaction); + } + + if (_timePeriodicityField != null) + foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) + { + //TODO find existing state and update + //var previousStates = previousEvaluation.ColumnStates.Where(c => c.PivotCategory == periodicity.GetPivotCategory()); + //foreach (var state in previousStates) + //{ + // //add old values onto new values + // periodicity. 
+ //} + periodicity.CommitToDatabase(evaluation); + } + + dqeRepository.EndTransactedConnection(true); + } + catch (Exception) + { + dqeRepository.EndTransactedConnection(false); + throw; + } + } + + forker.OnNotify(this, + new NotifyEventArgs(ProgressEventType.Information, + "CatalogueConstraintReport completed successfully and committed results to DQE server")); + //make report for old data + //subtract old data + //add new data + } + catch (Exception e) + { + forker.OnNotify(this, + e is OperationCanceledException + ? new NotifyEventArgs(ProgressEventType.Warning, "DQE Execution Cancelled", e) + : new NotifyEventArgs(ProgressEventType.Error, "Fatal Crash", e)); + } + finally + { + toDatabaseLogger.FinalizeTableLoadInfos(); + } + } + private bool _haveComplainedAboutTrailingWhitespaces; private string GetStringValueForPivotField(object o, IDataLoadEventListener listener) diff --git a/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs b/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs index 7046a403f3..0a701dbce6 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/DQEStateOverDataLoadRunId.cs @@ -50,6 +50,11 @@ public void InitializeDictionaries() RowsPassingValidationByDataLoadRunID = new Dictionary(); } + public string GetPiviotCategory() + { + return _pivotCategory; + } + public void AddKeyToDictionaries(int dataLoadRunID, Validator validator, QueryBuilder queryBuilder) { //ensure keys exit (if it is a novel data load run ID then we will add it to the dictionaries diff --git a/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs b/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs index 730234ed2b..c35bbbdbf5 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs @@ -31,6 
+31,11 @@ public PeriodicityCubesOverTime(string pivotCategory) _pivotCategory = pivotCategory; } + public string GetPivotCategory() + { + return _pivotCategory; + } + public static void PeriodicityCube() { } From 5222afaaabf6b54ee43f6a8825d6ac760d89d250 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 4 Nov 2024 15:53:33 +0000 Subject: [PATCH 04/35] improved add --- .../Reports/CatalogueConstraintReport.cs | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index e8fa703e86..dc387da284 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -6,6 +6,7 @@ using System; using System.Collections.Generic; +using System.Data; using System.Data.Common; using System.Diagnostics; using System.Linq; @@ -290,39 +291,39 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var evaluation = new Evaluation(dqeRepository, _catalogue); foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) { - //TODO find existing state and update - does this work? 
still to test - //var previousRowStates = previousEvaluation.RowStates.Where(r => r.PivotCategory == state.GetPiviotCategory()); - //foreach (var previousState in previousRowStates) - //{ - // state.RowsPassingValidationByDataLoadRunID[previousState.DataLoadRunID] = previousState.Correct; - // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID] = new Dictionary(); - // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID][Consequence.Missing] = previousState.Missing; - // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID][Consequence.Wrong] = previousState.Wrong; - // state.WorstConsequencesByDataLoadRunID[previousState.DataLoadRunID][Consequence.InvalidatesRow] = previousState.Invalid; - // var previousColumnStates = previousEvaluation.ColumnStates.Where(c => c.DataLoadRunID == previousState.DataLoadRunID); - - // state.AllColumnStates[previousState.DataLoadRunID] = previousColumnStates.ToArray(); - //} state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); } - foreach(var rowState in previousEvaluation.RowStates) + foreach (var rowState in previousEvaluation.RowStates) { - _ = new RowState(evaluation, rowState.DataLoadRunID,rowState.Correct,rowState.Missing, rowState.Wrong, rowState.Invalid,rowState.ValidatorXML,rowState.PivotCategory,con.Connection,con.Transaction); + _ = new RowState(evaluation, rowState.DataLoadRunID, rowState.Correct, rowState.Missing, rowState.Wrong, rowState.Invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } if (_timePeriodicityField != null) + { + var categories = previousEvaluation.RowStates.Select(r => r.PivotCategory).Distinct().ToList(); + foreach (var category in categories) + { + var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + if (!newByPivotCategoryCubesOverTime.ContainsKey(category)) + { + newByPivotCategoryCubesOverTime[category] = new 
PeriodicityCubesOverTime(category); + } + //exists, just add + foreach (var row in periodicityDT.AsEnumerable()) + { + for (int i = 0; i < int.Parse(row.ItemArray[2].ToString()); i++) + { + Enum.TryParse(row.ItemArray[3].ToString(), out Consequence cons); + newByPivotCategoryCubesOverTime[category].IncrementHyperCube(DateTime.Parse(row.ItemArray[1].ToString()).Year, DateTime.Parse(row.ItemArray[1].ToString()).Month, cons); + } + } + + } foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) { - //TODO find existing state and update - //var previousStates = previousEvaluation.ColumnStates.Where(c => c.PivotCategory == periodicity.GetPivotCategory()); - //foreach (var state in previousStates) - //{ - // //add old values onto new values - // periodicity. - //} periodicity.CommitToDatabase(evaluation); } - + } dqeRepository.EndTransactedConnection(true); } catch (Exception) From 244e65393eb9048363e2d843a56f9d41f0690fbe Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 5 Nov 2024 10:24:23 +0000 Subject: [PATCH 05/35] interim --- .../Triggers/DiffDatabaseDataFetcher.cs | 4 +- .../Reports/CatalogueConstraintReport.cs | 12 + .../Reports/ReportBuilder.cs | 229 ++++++++++++++++++ 3 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs diff --git a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs index 7ccfe6f8a0..29e4f6d08e 100644 --- a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs +++ b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs @@ -113,7 +113,7 @@ public void FetchData(ICheckNotifier checkNotifier) CheckResult.Success)); GetInsertData(server, database, checkNotifier); - GetUpdatetData(server, database, checkNotifier); + GetUpdatedData(server, database, checkNotifier); } catch (Exception e) { @@ -163,7 +163,7 @@ private void GetInsertData(DiscoveredServer server, DiscoveredDatabase database, } - private void 
GetUpdatetData(DiscoveredServer server, DiscoveredDatabase database, ICheckNotifier checkNotifier) + private void GetUpdatedData(DiscoveredServer server, DiscoveredDatabase database, ICheckNotifier checkNotifier) { const string archive = "archive"; const string zzArchive = "zzarchivezz"; diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index dc387da284..a4db505c33 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -19,6 +19,7 @@ using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; using Rdmp.Core.Logging; using Rdmp.Core.Logging.Listeners; +using Rdmp.Core.MapsDirectlyToDatabaseTable; using Rdmp.Core.QueryBuilding; using Rdmp.Core.Repositories; using Rdmp.Core.ReusableLibraryCode.Checks; @@ -289,6 +290,17 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li //mark down that we are beginning an evaluation on this the day of our lord etc... var previousEvaluation = dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); var evaluation = new Evaluation(dqeRepository, _catalogue); + + //find entries that have been put in the archive + //is this the correct table info? 
+ var tableInfo = _catalogue.CatalogueItems.First().ColumnInfo.TableInfo; + var dataDiffFetcher = new DiffDatabaseDataFetcher(9, tableInfo, (int)_dataLoadID, 50000); + dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); + var replaced = dataDiffFetcher.Updates_Replaced; + var newRows = dataDiffFetcher.Updates_New; + var inserts = dataDiffFetcher.Inserts; + //i think we need to run a report on just the replaced rows and take them away from the new results + foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) { state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); diff --git a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs new file mode 100644 index 0000000000..359940681c --- /dev/null +++ b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs @@ -0,0 +1,229 @@ +using FAnsi.Discovery; +using Rdmp.Core.Curation.Data; +using Rdmp.Core.DataLoad.Triggers; +using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; +using Rdmp.Core.Logging; +using Rdmp.Core.QueryBuilding; +using Rdmp.Core.Repositories; +using Rdmp.Core.ReusableLibraryCode.Progress; +using Rdmp.Core.Validation; +using Rdmp.Core.Validation.Constraints; +using System; +using System.Collections.Generic; +using System.Data.Common; +using System.Diagnostics; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace Rdmp.Core.DataQualityEngine.Reports; + +public class ReportBuilder +{ + private readonly string _dataLoadRunFieldName; + + //where the data is located + private DiscoveredServer _server; + private QueryBuilder _queryBuilder; + private Validator _validator; + private bool _containsDataLoadID; + + public static int MaximumPivotValues = 5000; + + private Dictionary byPivotRowStatesOverDataLoadRunId = new(); + private Dictionary byPivotCategoryCubesOverTime = new(); + + private IExternalDatabaseServer _loggingServer; + private string 
_loggingTask; + private LogManager _logManager; + + private int? _dataLoadID; + + private string _timePeriodicityField; + private string _pivotCategory; + private ICatalogue _catalogue; + private bool _haveComplainedAboutNullCategories; + private bool _haveComplainedAboutTrailingWhitespaces; + + public ReportBuilder(ICatalogue catalogue) { + _catalogue = catalogue; + } + + public void BuildReportInternals(IDataLoadEventListener listener, + CancellationToken cancellationToken, ForkDataLoadEventListener forker, DQERepository dqeRepository) + { + byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL")); + byPivotRowStatesOverDataLoadRunId.Add("ALL", new DQEStateOverDataLoadRunId("ALL")); + + //Check(new FromDataLoadEventListenerToCheckNotifier(forker)); + + var sw = Stopwatch.StartNew(); + using (var con = _server.GetConnection()) + { + con.Open(); + var qb = _queryBuilder; + if (_dataLoadID is not null) + qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, con); + cmd.CommandTimeout = 500000; + + var t = cmd.ExecuteReaderAsync(cancellationToken); + t.Wait(cancellationToken); + + if (cancellationToken.IsCancellationRequested) + throw new OperationCanceledException("User cancelled DQE while fetching data"); + + var r = t.Result; + + var progress = 0; + + while (r.Read()) + { + cancellationToken.ThrowIfCancellationRequested(); + + progress++; + var dataLoadRunIDOfCurrentRecord = 0; + //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available) + + //if the DataReader is likely to have a data load run ID column + if (_containsDataLoadID) + { + //get data load run id + var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]); + + //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue) + if (runID != null) + 
dataLoadRunIDOfCurrentRecord = (int)runID; + } + + string pivotValue = null; + + //if the user has a pivot category configured + if (_pivotCategory != null) + { + pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker); + + if (!_haveComplainedAboutNullCategories && string.IsNullOrWhiteSpace(pivotValue)) + { + forker.OnNotify(this, + new NotifyEventArgs(ProgressEventType.Warning, + $"Found a null/empty value for pivot category '{_pivotCategory}', this record will ONLY be recorded under ALL and not its specific category, you will not be warned of further nulls because there are likely to be many if there are any")); + _haveComplainedAboutNullCategories = true; + pivotValue = null; + } + } + + //always increase the "ALL" category + ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, + byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]); + + //if there is a value in the current record for the pivot column + if (pivotValue != null) + { + //if it is a novel + if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime)) + { + //we will need to expand the dictionaries + if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues) + throw new OverflowException( + $"Encountered more than {MaximumPivotValues} values for the pivot column {_pivotCategory} this will result in crazy space usage since it is a multiplicative scale of DQE tesseracts"); + + //expand both the time periodicity and the state results + byPivotRowStatesOverDataLoadRunId.Add(pivotValue, + new DQEStateOverDataLoadRunId(pivotValue)); + periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue); + byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime); + } + + //now we are sure that the dictionaries have the category field we can increment it + ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, +periodicityCubesOverTime, byPivotRowStatesOverDataLoadRunId[pivotValue]); + } + + if (progress 
% 5000 == 0) + forker.OnProgress(this, + new ProgressEventArgs($"Processing {_catalogue}", + new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); + } + + //final value + forker.OnProgress(this, + new ProgressEventArgs($"Processing {_catalogue}", + new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); + con.Close(); + } + + sw.Stop(); + foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) + state.CalculateFinalValues(); + } + + private string GetStringValueForPivotField(object o, IDataLoadEventListener listener) + { + if (o == null || o == DBNull.Value) + return null; + + var stringValue = o.ToString(); + var trimmedValue = stringValue.Trim(); + + if (!_haveComplainedAboutTrailingWhitespaces && stringValue != trimmedValue) + { + listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, + $"Found trailing/leading whitespace in value in Pivot field, this will be trimmed off:'{o}'")); + _haveComplainedAboutTrailingWhitespaces = true; + } + + return trimmedValue; + } + private void ProcessRecord(DQERepository dqeRepository, int dataLoadRunIDOfCurrentRecord, DbDataReader r, + PeriodicityCubesOverTime periodicity, DQEStateOverDataLoadRunId states) + { + //make sure all the results dictionaries + states.AddKeyToDictionaries(dataLoadRunIDOfCurrentRecord, _validator, _queryBuilder); + + //ask the validator to validate! + _validator.ValidateVerboseAdditive( + r, //validate the data reader + states.ColumnValidationFailuresByDataLoadRunID[ + dataLoadRunIDOfCurrentRecord], //additively adjust the validation failures dictionary + out var worstConsequence); //and tell us what the worst consequence in the row was + + + //increment the time periodicity hypercube! + if (_timePeriodicityField != null) + { + DateTime? 
dt; + + try + { + dt = dqeRepository.ObjectToNullableDateTime(r[_timePeriodicityField]); + } + catch (InvalidCastException e) + { + throw new Exception( + $"Found value {r[_timePeriodicityField]} of type {r[_timePeriodicityField].GetType().Name} in your time periodicity field which was not a valid date time, make sure your time periodicity field is a datetime datatype", + e); + } + + if (dt != null) + periodicity.IncrementHyperCube(dt.Value.Year, dt.Value.Month, worstConsequence); + } + + //now we need to update everything we know about all the columns + foreach (var state in states.AllColumnStates[dataLoadRunIDOfCurrentRecord]) + { + //start out by assuming everything is dandy + state.CountCorrect++; + + if (r[state.TargetProperty] == DBNull.Value) + state.CountDBNull++; + } + + //update row level dictionaries + if (worstConsequence == null) + states.RowsPassingValidationByDataLoadRunID[dataLoadRunIDOfCurrentRecord]++; + else + states.WorstConsequencesByDataLoadRunID[dataLoadRunIDOfCurrentRecord][(Consequence)worstConsequence]++; + } +} From e9fec7aa14ebe46b51da1f208512f8f0b0f0db03 Mon Sep 17 00:00:00 2001 From: James Friel Date: Wed, 6 Nov 2024 15:31:35 +0000 Subject: [PATCH 06/35] interim --- .../Triggers/DiffDatabaseDataFetcher.cs | 24 +- .../Reports/CatalogueConstraintReport.cs | 261 +++++------------- .../Reports/ReportBuilder.cs | 169 ++++++------ .../QueryBuilding/CohortQueryBuilderResult.cs | 1 + 4 files changed, 174 insertions(+), 281 deletions(-) diff --git a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs index 29e4f6d08e..01adfa4272 100644 --- a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs +++ b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs @@ -191,7 +191,8 @@ private void GetUpdatedData(DiscoveredServer server, DiscoveredDatabase database --Records which appear in the archive SELECT top {{0}} {{6}}, -{{7}} +{{7}}, +{{8}} FROM {{1}} CROSS APPLY ( @@ -200,7 +201,7 @@ 
SELECT TOP 1 {{2}}.* WHERE {{3}} order by {syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom)} desc - ) {{8}} + ) {{9}} where {{1}}.{{4}} = {{5}}"; break; @@ -214,13 +215,14 @@ SELECT TOP 1 {{2}}.* /*Records which appear in the archive*/ SELECT {{6}}, -{{7}} +{{7}}, +{{8}} FROM {{1}} Join -{{2}} {{8}} on {whereStatement.Replace(archiveTableName, archive)} +{{2}} {{9}} on {whereStatement.Replace(archiveTableName, archive)} AND - {{8}}.{{9}} = (select max({syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom)}) from {{2}} s where {whereStatement.Replace(archiveTableName, archive).Replace(tableName, "s")}) + {{9}}.{{10}} = (select max({syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom)}) from {{2}} s where {whereStatement.Replace(archiveTableName, archive).Replace(tableName, "s")}) where {{1}}.{{4}} = {{5}} @@ -241,8 +243,9 @@ SELECT TOP 1 {{2}}.* _dataLoadRunID, //{5} GetSharedColumnsSQL(tableName), //{6} GetSharedColumnsSQLWithColumnAliasPrefix(archive, zzArchive), //{7} - archive, //{8} - syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom) + GetHICSpecialColumns(archive, zzArchive),//{8} + archive, //{9} + syntaxHelper.EnsureWrapped(SpecialFieldNames.ValidFrom) //{10} ); var dtComboTable = new DataTable(); @@ -272,6 +275,13 @@ SELECT TOP 1 {{2}}.* } } + private string GetHICSpecialColumns(string tableName, string columnAliasString) + { + return $@"{tableName}.{SpecialFieldNames.DataLoadRunID} as {SpecialFieldNames.DataLoadRunID}, +{tableName}.{SpecialFieldNames.ValidFrom} as {SpecialFieldNames.ValidFrom} +"; + } + private string GetSharedColumnsSQLWithColumnAliasPrefix(string tableName, string columnAliasPrefix) { var sb = new StringBuilder(); diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index a4db505c33..e689657d8a 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ 
b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -94,120 +94,11 @@ private void SetupLogging(ICatalogueRepository repository) } } - private bool haveComplainedAboutNullCategories; + //private bool haveComplainedAboutNullCategories; - private void BuildReportInternals(ICatalogue c, IDataLoadEventListener listener, - CancellationToken cancellationToken, ForkDataLoadEventListener forker, DQERepository dqeRepository) - { - byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL")); - byPivotRowStatesOverDataLoadRunId.Add("ALL", new DQEStateOverDataLoadRunId("ALL")); - - Check(new FromDataLoadEventListenerToCheckNotifier(forker)); - - var sw = Stopwatch.StartNew(); - using (var con = _server.GetConnection()) - { - con.Open(); - var qb = _queryBuilder; - if (_dataLoadID is not null) - qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); - var cmd = _server.GetCommand(qb.SQL, con); - cmd.CommandTimeout = 500000; - - var t = cmd.ExecuteReaderAsync(cancellationToken); - t.Wait(cancellationToken); - - if (cancellationToken.IsCancellationRequested) - throw new OperationCanceledException("User cancelled DQE while fetching data"); - - var r = t.Result; - - var progress = 0; - - while (r.Read()) - { - cancellationToken.ThrowIfCancellationRequested(); - - progress++; - var dataLoadRunIDOfCurrentRecord = 0; - //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available) - - //if the DataReader is likely to have a data load run ID column - if (_containsDataLoadID) - { - //get data load run id - var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]); - - //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue) - if (runID != null) - dataLoadRunIDOfCurrentRecord = (int)runID; - } - - string pivotValue = null; - - //if the user has a pivot category 
configured - if (_pivotCategory != null) - { - pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker); - - if (!haveComplainedAboutNullCategories && string.IsNullOrWhiteSpace(pivotValue)) - { - forker.OnNotify(this, - new NotifyEventArgs(ProgressEventType.Warning, - $"Found a null/empty value for pivot category '{_pivotCategory}', this record will ONLY be recorded under ALL and not its specific category, you will not be warned of further nulls because there are likely to be many if there are any")); - haveComplainedAboutNullCategories = true; - pivotValue = null; - } - } - - //always increase the "ALL" category - ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, - byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]); - - //if there is a value in the current record for the pivot column - if (pivotValue != null) - { - //if it is a novel - if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime)) - { - //we will need to expand the dictionaries - if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues) - throw new OverflowException( - $"Encountered more than {MaximumPivotValues} values for the pivot column {_pivotCategory} this will result in crazy space usage since it is a multiplicative scale of DQE tesseracts"); - - //expand both the time periodicity and the state results - byPivotRowStatesOverDataLoadRunId.Add(pivotValue, - new DQEStateOverDataLoadRunId(pivotValue)); - periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue); - byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime); - } - - //now we are sure that the dictionaries have the category field we can increment it - ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, -periodicityCubesOverTime, byPivotRowStatesOverDataLoadRunId[pivotValue]); - } - - if (progress % 5000 == 0) - forker.OnProgress(this, - new ProgressEventArgs($"Processing {_catalogue}", - new 
ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); - } - - //final value - forker.OnProgress(this, - new ProgressEventArgs($"Processing {_catalogue}", - new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); - con.Close(); - } - - sw.Stop(); - foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) - state.CalculateFinalValues(); - } - public override void GenerateReport(ICatalogue c, IDataLoadEventListener listener, - CancellationToken cancellationToken) + CancellationToken cancellationToken) { SetupLogging(c.CatalogueRepository); @@ -220,7 +111,32 @@ public override void GenerateReport(ICatalogue c, IDataLoadEventListener listene { _catalogue = c; var dqeRepository = ExplicitDQERepository ?? new DQERepository(c.CatalogueRepository); - BuildReportInternals(c, listener, cancellationToken, forker, dqeRepository); + DbDataReader r; + Check(new FromDataLoadEventListenerToCheckNotifier(forker)); + using (var con = _server.GetConnection()) + { + con.Open(); + var qb = _queryBuilder; + if (_dataLoadID is not null) + qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, con); + cmd.CommandTimeout = 500000; + + var t = cmd.ExecuteReaderAsync(cancellationToken); + t.Wait(cancellationToken); + + if (cancellationToken.IsCancellationRequested) + throw new OperationCanceledException("User cancelled DQE while fetching data"); + + r = t.Result; + var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, r); + reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + + byPivotCategoryCubesOverTime = reportBuilder.GetByPivotCategoryCubesOverTime(); + byPivotRowStatesOverDataLoadRunId = reportBuilder.GetByPivotRowStatesOverDataLoadRunId(); + } + + //now commit results @@ -280,9 +196,30 @@ public void UpdateReport(ICatalogue 
c, int dataLoadID, IDataLoadEventListener li _catalogue = c; var dqeRepository = ExplicitDQERepository ?? new DQERepository(c.CatalogueRepository); //make report for new data - BuildReportInternals(c, listener, cancellationToken, forker, dqeRepository); - var newByPivotRowStatesOverDataLoadRunId = byPivotRowStatesOverDataLoadRunId; - var newByPivotCategoryCubesOverTime = byPivotCategoryCubesOverTime; + DbDataReader r; + Check(new FromDataLoadEventListenerToCheckNotifier(forker)); + + using (var con = _server.GetConnection()) + { + con.Open(); + var qb = _queryBuilder; + if (_dataLoadID is not null) + qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, con); + cmd.CommandTimeout = 500000; + + var t = cmd.ExecuteReaderAsync(cancellationToken); + t.Wait(cancellationToken); + + if (cancellationToken.IsCancellationRequested) + throw new OperationCanceledException("User cancelled DQE while fetching data"); + + r = t.Result; + } + var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, r); + reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var newByPivotRowStatesOverDataLoadRunId = reportBuilder.GetByPivotRowStatesOverDataLoadRunId(); + var newByPivotCategoryCubesOverTime = reportBuilder.GetByPivotCategoryCubesOverTime(); using (var con = dqeRepository.BeginNewTransactedConnection()) { try @@ -300,15 +237,34 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var newRows = dataDiffFetcher.Updates_New; var inserts = dataDiffFetcher.Inserts; //i think we need to run a report on just the replaced rows and take them away from the new results - + var previousReportBuilder = new ReportBuilder(_catalogue, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, 
replaced); + previousReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var previousRows = previousReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); + var previousColumns = previousReportBuilder.GetByPivotCategoryCubesOverTime(); + //want to modify newByPivotRowStatesOverDataLoadRunId? foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) { state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); } foreach (var rowState in previousEvaluation.RowStates) + { + //reduce numbers based on the replaed dt + //will also need to update all(maybe?) + var pivotColumn = _catalogue.PivotCategory_ExtractionInformation.SelectSQL.Split('.').Last().Trim('[').Trim(']'); + //how do we know witch column the pivot category is from? + var matching = replaced.AsEnumerable().Where(row => int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID && row[pivotColumn].ToString() == rowState.PivotCategory).ToList(); + if (matching.Any()) + { + Console.WriteLine("do something"); + } + } + + foreach (var rowState in previousEvaluation.RowStates) { _ = new RowState(evaluation, rowState.DataLoadRunID, rowState.Correct, rowState.Missing, rowState.Wrong, rowState.Invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } + //actually want to create a new rowstate for each update (including all) + if (_timePeriodicityField != null) { @@ -348,9 +304,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li forker.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, "CatalogueConstraintReport completed successfully and committed results to DQE server")); - //make report for old data - //subtract old data - //add new data } catch (Exception e) { @@ -365,26 +318,6 @@ e is OperationCanceledException } } - private bool _haveComplainedAboutTrailingWhitespaces; - - private string GetStringValueForPivotField(object o, IDataLoadEventListener 
listener) - { - if (o == null || o == DBNull.Value) - return null; - - var stringValue = o.ToString(); - var trimmedValue = stringValue.Trim(); - - if (!_haveComplainedAboutTrailingWhitespaces && stringValue != trimmedValue) - { - listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, - $"Found trailing/leading whitespace in value in Pivot field, this will be trimmed off:'{o}'")); - _haveComplainedAboutTrailingWhitespaces = true; - } - - return trimmedValue; - } - private string _timePeriodicityField; private string _pivotCategory; @@ -643,54 +576,4 @@ private void SetupAdditionalValidationRules(ICheckNotifier notifier) } } - private void ProcessRecord(DQERepository dqeRepository, int dataLoadRunIDOfCurrentRecord, DbDataReader r, - PeriodicityCubesOverTime periodicity, DQEStateOverDataLoadRunId states) - { - //make sure all the results dictionaries - states.AddKeyToDictionaries(dataLoadRunIDOfCurrentRecord, _validator, _queryBuilder); - - //ask the validator to validate! - _validator.ValidateVerboseAdditive( - r, //validate the data reader - states.ColumnValidationFailuresByDataLoadRunID[ - dataLoadRunIDOfCurrentRecord], //additively adjust the validation failures dictionary - out var worstConsequence); //and tell us what the worst consequence in the row was - - - //increment the time periodicity hypercube! - if (_timePeriodicityField != null) - { - DateTime? 
dt; - - try - { - dt = dqeRepository.ObjectToNullableDateTime(r[_timePeriodicityField]); - } - catch (InvalidCastException e) - { - throw new Exception( - $"Found value {r[_timePeriodicityField]} of type {r[_timePeriodicityField].GetType().Name} in your time periodicity field which was not a valid date time, make sure your time periodicity field is a datetime datatype", - e); - } - - if (dt != null) - periodicity.IncrementHyperCube(dt.Value.Year, dt.Value.Month, worstConsequence); - } - - //now we need to update everything we know about all the columns - foreach (var state in states.AllColumnStates[dataLoadRunIDOfCurrentRecord]) - { - //start out by assuming everything is dandy - state.CountCorrect++; - - if (r[state.TargetProperty] == DBNull.Value) - state.CountDBNull++; - } - - //update row level dictionaries - if (worstConsequence == null) - states.RowsPassingValidationByDataLoadRunID[dataLoadRunIDOfCurrentRecord]++; - else - states.WorstConsequencesByDataLoadRunID[dataLoadRunIDOfCurrentRecord][(Consequence)worstConsequence]++; - } } \ No newline at end of file diff --git a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs index 359940681c..38d63bff65 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs @@ -10,6 +10,7 @@ using Rdmp.Core.Validation.Constraints; using System; using System.Collections.Generic; +using System.Data; using System.Data.Common; using System.Diagnostics; using System.Linq; @@ -19,12 +20,11 @@ namespace Rdmp.Core.DataQualityEngine.Reports; -public class ReportBuilder +public class ReportBuilder { private readonly string _dataLoadRunFieldName; //where the data is located - private DiscoveredServer _server; private QueryBuilder _queryBuilder; private Validator _validator; private bool _containsDataLoadID; @@ -34,23 +34,40 @@ public class ReportBuilder private Dictionary byPivotRowStatesOverDataLoadRunId = new(); 
private Dictionary byPivotCategoryCubesOverTime = new(); - private IExternalDatabaseServer _loggingServer; - private string _loggingTask; - private LogManager _logManager; - - private int? _dataLoadID; - private string _timePeriodicityField; private string _pivotCategory; private ICatalogue _catalogue; private bool _haveComplainedAboutNullCategories; private bool _haveComplainedAboutTrailingWhitespaces; - public ReportBuilder(ICatalogue catalogue) { + private DataTable _resultsDT = new(); + public ReportBuilder(ICatalogue catalogue, Validator validator, QueryBuilder queryBuilder, string dataLoadRunFieldName, bool containsDataLoadID, string timePeriodicityField, string pivotCategory, DbDataReader results) + { + _catalogue = catalogue; + _validator = validator; + _queryBuilder = queryBuilder; + _dataLoadRunFieldName = dataLoadRunFieldName; + _containsDataLoadID = containsDataLoadID; + _timePeriodicityField = timePeriodicityField; + _pivotCategory = pivotCategory; + _resultsDT.Load(results); + } + public ReportBuilder(ICatalogue catalogue, Validator validator, QueryBuilder queryBuilder, string dataLoadRunFieldName, bool containsDataLoadID, string timePeriodicityField, string pivotCategory, DataTable results) + { _catalogue = catalogue; + _validator = validator; + _queryBuilder = queryBuilder; + _dataLoadRunFieldName = dataLoadRunFieldName; + _containsDataLoadID = containsDataLoadID; + _timePeriodicityField = timePeriodicityField; + _pivotCategory = pivotCategory; + _resultsDT = results; } - public void BuildReportInternals(IDataLoadEventListener listener, + public Dictionary GetByPivotRowStatesOverDataLoadRunId() => byPivotRowStatesOverDataLoadRunId; + public Dictionary GetByPivotCategoryCubesOverTime() => byPivotCategoryCubesOverTime; + + public void BuildReportInternals( CancellationToken cancellationToken, ForkDataLoadEventListener forker, DQERepository dqeRepository) { byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL")); @@ -59,101 +76,83 
@@ public void BuildReportInternals(IDataLoadEventListener listener, //Check(new FromDataLoadEventListenerToCheckNotifier(forker)); var sw = Stopwatch.StartNew(); - using (var con = _server.GetConnection()) + var progress = 0; + + foreach (var r in _resultsDT.AsEnumerable()) { - con.Open(); - var qb = _queryBuilder; - if (_dataLoadID is not null) - qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); - var cmd = _server.GetCommand(qb.SQL, con); - cmd.CommandTimeout = 500000; + cancellationToken.ThrowIfCancellationRequested(); - var t = cmd.ExecuteReaderAsync(cancellationToken); - t.Wait(cancellationToken); + progress++; + var dataLoadRunIDOfCurrentRecord = 0; + //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available) - if (cancellationToken.IsCancellationRequested) - throw new OperationCanceledException("User cancelled DQE while fetching data"); + //if the DataReader is likely to have a data load run ID column + if (_containsDataLoadID) + { + //get data load run id + var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]); - var r = t.Result; + //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue) + if (runID != null) + dataLoadRunIDOfCurrentRecord = (int)runID; + } - var progress = 0; + string pivotValue = null; - while (r.Read()) + //if the user has a pivot category configured + if (_pivotCategory != null) { - cancellationToken.ThrowIfCancellationRequested(); + pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker); - progress++; - var dataLoadRunIDOfCurrentRecord = 0; - //to start with assume we will pass the results for the 'unknown batch' (where data load run ID is null or not available) - - //if the DataReader is likely to have a data load run ID column - if (_containsDataLoadID) + if (!_haveComplainedAboutNullCategories && 
string.IsNullOrWhiteSpace(pivotValue)) { - //get data load run id - var runID = dqeRepository.ObjectToNullableInt(r[_dataLoadRunFieldName]); - - //if it has a value use it (otherwise it is null so use 0 - ugh I know, it's a primary key constraint issue) - if (runID != null) - dataLoadRunIDOfCurrentRecord = (int)runID; + forker.OnNotify(this, + new NotifyEventArgs(ProgressEventType.Warning, + $"Found a null/empty value for pivot category '{_pivotCategory}', this record will ONLY be recorded under ALL and not its specific category, you will not be warned of further nulls because there are likely to be many if there are any")); + _haveComplainedAboutNullCategories = true; + pivotValue = null; } + } - string pivotValue = null; + //always increase the "ALL" category + ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, + byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]); - //if the user has a pivot category configured - if (_pivotCategory != null) + //if there is a value in the current record for the pivot column + if (pivotValue != null) + { + //if it is a novel + if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime)) { - pivotValue = GetStringValueForPivotField(r[_pivotCategory], forker); - - if (!_haveComplainedAboutNullCategories && string.IsNullOrWhiteSpace(pivotValue)) - { - forker.OnNotify(this, - new NotifyEventArgs(ProgressEventType.Warning, - $"Found a null/empty value for pivot category '{_pivotCategory}', this record will ONLY be recorded under ALL and not its specific category, you will not be warned of further nulls because there are likely to be many if there are any")); - _haveComplainedAboutNullCategories = true; - pivotValue = null; - } + //we will need to expand the dictionaries + if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues) + throw new OverflowException( + $"Encountered more than {MaximumPivotValues} values for the pivot column {_pivotCategory} this will 
result in crazy space usage since it is a multiplicative scale of DQE tesseracts"); + + //expand both the time periodicity and the state results + byPivotRowStatesOverDataLoadRunId.Add(pivotValue, + new DQEStateOverDataLoadRunId(pivotValue)); + periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue); + byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime); } - //always increase the "ALL" category + //now we are sure that the dictionaries have the category field we can increment it ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, - byPivotCategoryCubesOverTime["ALL"], byPivotRowStatesOverDataLoadRunId["ALL"]); - - //if there is a value in the current record for the pivot column - if (pivotValue != null) - { - //if it is a novel - if (!byPivotCategoryCubesOverTime.TryGetValue(pivotValue, out var periodicityCubesOverTime)) - { - //we will need to expand the dictionaries - if (byPivotCategoryCubesOverTime.Keys.Count > MaximumPivotValues) - throw new OverflowException( - $"Encountered more than {MaximumPivotValues} values for the pivot column {_pivotCategory} this will result in crazy space usage since it is a multiplicative scale of DQE tesseracts"); - - //expand both the time periodicity and the state results - byPivotRowStatesOverDataLoadRunId.Add(pivotValue, - new DQEStateOverDataLoadRunId(pivotValue)); - periodicityCubesOverTime = new PeriodicityCubesOverTime(pivotValue); - byPivotCategoryCubesOverTime.Add(pivotValue, periodicityCubesOverTime); - } - - //now we are sure that the dictionaries have the category field we can increment it - ProcessRecord(dqeRepository, dataLoadRunIDOfCurrentRecord, r, periodicityCubesOverTime, byPivotRowStatesOverDataLoadRunId[pivotValue]); - } - - if (progress % 5000 == 0) - forker.OnProgress(this, - new ProgressEventArgs($"Processing {_catalogue}", - new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); } - //final value - forker.OnProgress(this, - new 
ProgressEventArgs($"Processing {_catalogue}", - new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); - con.Close(); + if (progress % 5000 == 0) + forker.OnProgress(this, + new ProgressEventArgs($"Processing {_catalogue}", + new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); } + //final value + forker.OnProgress(this, + new ProgressEventArgs($"Processing {_catalogue}", + new ProgressMeasurement(progress, ProgressType.Records), sw.Elapsed)); + + sw.Stop(); foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) state.CalculateFinalValues(); @@ -176,7 +175,7 @@ private string GetStringValueForPivotField(object o, IDataLoadEventListener list return trimmedValue; } - private void ProcessRecord(DQERepository dqeRepository, int dataLoadRunIDOfCurrentRecord, DbDataReader r, + private void ProcessRecord(DQERepository dqeRepository, int dataLoadRunIDOfCurrentRecord, DataRow r, PeriodicityCubesOverTime periodicity, DQEStateOverDataLoadRunId states) { //make sure all the results dictionaries diff --git a/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs b/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs index d960895b3c..dda7484df6 100644 --- a/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs +++ b/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs @@ -403,6 +403,7 @@ private void MakeCacheDecision() foreach (var dependency in Dependencies) { _log.AppendLine($"Evaluating '{dependency.CohortSet}'"); + var x = dependency.CohortSet.Catalogue.GetTableInfoList(false); foreach (var dependantTable in dependency.CohortSet.Catalogue.GetTableInfoList(false)) HandleDependency(dependency, false, dependantTable); From b30c682f0a3d8f8d5b25fb162ae1a458379d2f76 Mon Sep 17 00:00:00 2001 From: James Friel Date: Thu, 7 Nov 2024 13:31:58 +0000 Subject: [PATCH 07/35] interim --- .../Reports/CatalogueConstraintReport.cs | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git 
a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index e689657d8a..0e2e3dfe72 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -233,34 +233,27 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var tableInfo = _catalogue.CatalogueItems.First().ColumnInfo.TableInfo; var dataDiffFetcher = new DiffDatabaseDataFetcher(9, tableInfo, (int)_dataLoadID, 50000); dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); - var replaced = dataDiffFetcher.Updates_Replaced; - var newRows = dataDiffFetcher.Updates_New; - var inserts = dataDiffFetcher.Inserts; - //i think we need to run a report on just the replaced rows and take them away from the new results + var replaced = dataDiffFetcher.Updates_Replaced; //all the stuff that has been replaced by the new data load + var previousReportBuilder = new ReportBuilder(_catalogue, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, replaced); previousReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); var previousRows = previousReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); var previousColumns = previousReportBuilder.GetByPivotCategoryCubesOverTime(); - //want to modify newByPivotRowStatesOverDataLoadRunId? + foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) { state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); } + foreach (var rowState in previousEvaluation.RowStates) { - //reduce numbers based on the replaed dt - //will also need to update all(maybe?) - var pivotColumn = _catalogue.PivotCategory_ExtractionInformation.SelectSQL.Split('.').Last().Trim('[').Trim(']'); - //how do we know witch column the pivot category is from? 
- var matching = replaced.AsEnumerable().Where(row => int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID && row[pivotColumn].ToString() == rowState.PivotCategory).ToList(); - if (matching.Any()) + if(replaced.AsEnumerable().Any() && int.Parse(replaced.AsEnumerable().First()[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID) { - Console.WriteLine("do something"); + var dqeState = previousRows[rowState.PivotCategory];//how to get the row state from a report? + Console.WriteLine("test"); + //var correct = rowState.Correct - } - } - foreach (var rowState in previousEvaluation.RowStates) - { _ = new RowState(evaluation, rowState.DataLoadRunID, rowState.Correct, rowState.Missing, rowState.Wrong, rowState.Invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } //actually want to create a new rowstate for each update (including all) From 26b9b3e98d08e40ab0b64be0a3e7bbb63b5c2301 Mon Sep 17 00:00:00 2001 From: James Friel Date: Fri, 8 Nov 2024 11:39:03 +0000 Subject: [PATCH 08/35] all not working --- .../Reports/CatalogueConstraintReport.cs | 70 ++++++++++++++----- .../Reports/ReportBuilder.cs | 2 - 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 0e2e3dfe72..25f10bf436 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -196,7 +196,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li _catalogue = c; var dqeRepository = ExplicitDQERepository ?? 
new DQERepository(c.CatalogueRepository); //make report for new data - DbDataReader r; + DataTable rDT = new(); Check(new FromDataLoadEventListenerToCheckNotifier(forker)); using (var con = _server.GetConnection()) @@ -207,16 +207,20 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); var cmd = _server.GetCommand(qb.SQL, con); cmd.CommandTimeout = 500000; - - var t = cmd.ExecuteReaderAsync(cancellationToken); - t.Wait(cancellationToken); - - if (cancellationToken.IsCancellationRequested) - throw new OperationCanceledException("User cancelled DQE while fetching data"); - - r = t.Result; + var adapter = _server.GetDataAdapter(cmd); + rDT.BeginLoadData(); + adapter.Fill(rDT); + rDT.EndLoadData(); + con.Close(); + //var t = cmd.ExecuteReaderAsync(cancellationToken); + //t.Wait(cancellationToken); + + //if (cancellationToken.IsCancellationRequested) + // throw new OperationCanceledException("User cancelled DQE while fetching data"); + + //r = t.Result; } - var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, r); + var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, rDT); reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); var newByPivotRowStatesOverDataLoadRunId = reportBuilder.GetByPivotRowStatesOverDataLoadRunId(); var newByPivotCategoryCubesOverTime = reportBuilder.GetByPivotCategoryCubesOverTime(); @@ -240,21 +244,55 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var previousRows = previousReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); var previousColumns = previousReportBuilder.GetByPivotCategoryCubesOverTime(); + var pivotColumn = 
c.PivotCategory_ExtractionInformation.ColumnInfo.Name; + foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) { state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); } - + foreach (var rowState in previousEvaluation.RowStates) { - if(replaced.AsEnumerable().Any() && int.Parse(replaced.AsEnumerable().First()[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID) + //if(replaced.AsEnumerable().Any() && int.Parse(replaced.AsEnumerable().First()[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID) + //{ + // var dqeState = previousRows[rowState.PivotCategory];//how to get the row state from a report? + // Console.WriteLine("test"); + // //var correct = rowState.Correct - + //} + var correct = rowState.Correct; + var missing = rowState.Missing; + var wrong = rowState.Wrong; + var invalid = rowState.Invalid; + // if(replaced.AsEnumerable().Any() && rowState.PivotCategory != "ALL") + // { + // var matchingRows = replaced.AsEnumerable().Where(row => int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID && row[pivotColumn].ToString() == rowState.PivotCategory); + // //can we find these rows in the previous rows? + // var pivotCategoryRow = previousRows[rowState.PivotCategory]; + // var oldTotal = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[dataLoadRunID]; + //WorstConsequencesByDataLoadRunID[dataLoadRunID][Consequence.Missing], + //WorstConsequencesByDataLoadRunID[dataLoadRunID][Consequence.Wrong], + //WorstConsequencesByDataLoadRunID[dataLoadRunID][Consequence.InvalidatesRow] + // var x = previousRows; + // } + if (replaced.AsEnumerable().Any() && previousRows.TryGetValue(rowState.PivotCategory, out var pivotCategoryRow)) { - var dqeState = previousRows[rowState.PivotCategory];//how to get the row state from a report? 
- Console.WriteLine("test"); - //var correct = rowState.Correct - + var oldCorrect = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[0]; + var oldMissing = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Missing]; + var oldWrong = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Wrong]; + var oldInvalid = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.InvalidatesRow]; + correct -= oldCorrect; + missing -= oldMissing; + wrong -= oldWrong; + invalid -= oldInvalid; + Console.WriteLine("1"); } - _ = new RowState(evaluation, rowState.DataLoadRunID, rowState.Correct, rowState.Missing, rowState.Wrong, rowState.Invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); + if (rowState.PivotCategory == "ALL") + { + //todo ...something + } + if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; + _ = new RowState(evaluation, rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } //actually want to create a new rowstate for each update (including all) diff --git a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs index 38d63bff65..b60e097b2b 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs @@ -73,8 +73,6 @@ public void BuildReportInternals( byPivotCategoryCubesOverTime.Add("ALL", new PeriodicityCubesOverTime("ALL")); byPivotRowStatesOverDataLoadRunId.Add("ALL", new DQEStateOverDataLoadRunId("ALL")); - //Check(new FromDataLoadEventListenerToCheckNotifier(forker)); - var sw = Stopwatch.StartNew(); var progress = 0; From 212736b74fcf7c27d829d9df1373e2b406a8d56c Mon Sep 17 00:00:00 2001 From: James Friel Date: Fri, 8 Nov 2024 13:05:16 +0000 Subject: [PATCH 09/35] correct rows --- .../Reports/CatalogueConstraintReport.cs | 30 ++++--------------- 1 file changed, 6 
insertions(+), 24 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 25f10bf436..4e70ba3187 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -196,7 +196,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li _catalogue = c; var dqeRepository = ExplicitDQERepository ?? new DQERepository(c.CatalogueRepository); //make report for new data - DataTable rDT = new(); + DataTable rDT = new(); Check(new FromDataLoadEventListenerToCheckNotifier(forker)); using (var con = _server.GetConnection()) @@ -253,27 +253,12 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li foreach (var rowState in previousEvaluation.RowStates) { - //if(replaced.AsEnumerable().Any() && int.Parse(replaced.AsEnumerable().First()[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID) - //{ - // var dqeState = previousRows[rowState.PivotCategory];//how to get the row state from a report? - // Console.WriteLine("test"); - // //var correct = rowState.Correct - - //} var correct = rowState.Correct; var missing = rowState.Missing; var wrong = rowState.Wrong; var invalid = rowState.Invalid; - // if(replaced.AsEnumerable().Any() && rowState.PivotCategory != "ALL") - // { - // var matchingRows = replaced.AsEnumerable().Where(row => int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID && row[pivotColumn].ToString() == rowState.PivotCategory); - // //can we find these rows in the previous rows? 
- // var pivotCategoryRow = previousRows[rowState.PivotCategory]; - // var oldTotal = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[dataLoadRunID]; - //WorstConsequencesByDataLoadRunID[dataLoadRunID][Consequence.Missing], - //WorstConsequencesByDataLoadRunID[dataLoadRunID][Consequence.Wrong], - //WorstConsequencesByDataLoadRunID[dataLoadRunID][Consequence.InvalidatesRow] - // var x = previousRows; - // } + //if (rowState.PivotCategory != "ALL") + //{ if (replaced.AsEnumerable().Any() && previousRows.TryGetValue(rowState.PivotCategory, out var pivotCategoryRow)) { var oldCorrect = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[0]; @@ -287,15 +272,12 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li Console.WriteLine("1"); } - if (rowState.PivotCategory == "ALL") - { - //todo ...something - } if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; + //} + + _ = new RowState(evaluation, rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } - //actually want to create a new rowstate for each update (including all) - if (_timePeriodicityField != null) { From f9f16707caae8032e12e868c1158907e49097076 Mon Sep 17 00:00:00 2001 From: James Friel Date: Fri, 8 Nov 2024 13:51:36 +0000 Subject: [PATCH 10/35] add todo --- .../Reports/CatalogueConstraintReport.cs | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 4e70ba3187..03e2d616de 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -257,8 +257,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var missing = rowState.Missing; var wrong = rowState.Wrong; var invalid = 
rowState.Invalid; - //if (rowState.PivotCategory != "ALL") - //{ if (replaced.AsEnumerable().Any() && previousRows.TryGetValue(rowState.PivotCategory, out var pivotCategoryRow)) { var oldCorrect = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[0]; @@ -273,11 +271,31 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; - //} - _ = new RowState(evaluation, rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } + //TODO need to update the column states ad the counts for ALL are wrong + foreach (var columnState in previousEvaluation.ColumnStates) + { + var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) + { + CountMissing = columnState.CountMissing, + CountWrong = columnState.CountWrong, + CountInvalidatesRow = columnState.CountInvalidatesRow, + CountCorrect = columnState.CountCorrect, + CountDBNull = columnState.CountDBNull + }; + //if (replaced.AsEnumerable().Any() && previousColumns.TryGetValue(columnState.PivotCategory, out var pivotCategoryColumn)) { + // var oldCorect = pivotCategoryColumn.hyperCube[columnState.y] + + + //} + if(replaced.AsEnumerable().Any()) + { + + } + cs.Commit(evaluation, columnState.PivotCategory, con.Connection, con.Transaction); + } if (_timePeriodicityField != null) { From cd08015a060e41c7efd7c675a67bab4d78b44ffc Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 11 Nov 2024 14:54:00 +0000 Subject: [PATCH 11/35] columns still not working --- .../Reports/CatalogueConstraintReport.cs | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 03e2d616de..0281e17a82 100644 --- 
a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -12,6 +12,7 @@ using System.Linq; using System.Threading; using FAnsi.Discovery; +using Org.BouncyCastle.Security.Certificates; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Defaults; using Rdmp.Core.DataLoad.Triggers; @@ -28,6 +29,7 @@ using Rdmp.Core.Validation; using Rdmp.Core.Validation.Constraints; using Rdmp.Core.Validation.Constraints.Secondary.Predictor; +using static Terminal.Gui.Application; namespace Rdmp.Core.DataQualityEngine.Reports; @@ -285,14 +287,30 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li CountCorrect = columnState.CountCorrect, CountDBNull = columnState.CountDBNull }; - //if (replaced.AsEnumerable().Any() && previousColumns.TryGetValue(columnState.PivotCategory, out var pivotCategoryColumn)) { - // var oldCorect = pivotCategoryColumn.hyperCube[columnState.y] + var x = previousRows.TryGetValue(columnState.PivotCategory, out var pivotCategoryRow); + if (pivotCategoryRow != null) + { + pivotCategoryRow.AllColumnStates.TryGetValue(columnState.DataLoadRunID, out var allcolumnStates); + if (allcolumnStates is not null) + { + //var allcolumnStates = pivotCategoryRow.AllColumnStates[columnState.DataLoadRunID]; + var col = allcolumnStates.Where(c => c.TargetProperty == columnState.TargetProperty).FirstOrDefault(); + if (col is not null) + { + //cs.CountMissing -= col.CountMissing; + cs.CountMissing = col.CountMissing - cs.CountMissing; + cs.CountWrong -= col.CountWrong; + cs.CountInvalidatesRow -= col.CountInvalidatesRow; + cs.CountCorrect -= col.CountCorrect; + cs.CountDBNull -= col.CountDBNull; + } + } + } - //} - if(replaced.AsEnumerable().Any()) + if (replaced.AsEnumerable().Any() && previousColumns.TryGetValue(columnState.PivotCategory, out var pivotCategoryColumns)) { - + var y = pivotCategoryColumns; } cs.Commit(evaluation, columnState.PivotCategory, 
con.Connection, con.Transaction); } From a0b7ad169d32b82a45e09182faabfd8bb0160bf8 Mon Sep 17 00:00:00 2001 From: James Friel Date: Thu, 14 Nov 2024 11:06:10 +0000 Subject: [PATCH 12/35] working periodicity state --- .../Reports/CatalogueConstraintReport.cs | 51 +++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 0281e17a82..9a9faded3b 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -12,6 +12,7 @@ using System.Linq; using System.Threading; using FAnsi.Discovery; +using MongoDB.Driver; using Org.BouncyCastle.Security.Certificates; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Defaults; @@ -96,9 +97,6 @@ private void SetupLogging(ICatalogueRepository repository) } } - //private bool haveComplainedAboutNullCategories; - - public override void GenerateReport(ICatalogue c, IDataLoadEventListener listener, CancellationToken cancellationToken) { @@ -214,13 +212,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li adapter.Fill(rDT); rDT.EndLoadData(); con.Close(); - //var t = cmd.ExecuteReaderAsync(cancellationToken); - //t.Wait(cancellationToken); - - //if (cancellationToken.IsCancellationRequested) - // throw new OperationCanceledException("User cancelled DQE while fetching data"); - - //r = t.Result; } var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, rDT); reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); @@ -246,7 +237,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var previousRows = previousReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); var previousColumns = 
previousReportBuilder.GetByPivotCategoryCubesOverTime(); - var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.Name; + var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) { @@ -276,7 +267,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li _ = new RowState(evaluation, rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } - //TODO need to update the column states ad the counts for ALL are wrong foreach (var columnState in previousEvaluation.ColumnStates) { var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) @@ -290,7 +280,9 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var x = previousRows.TryGetValue(columnState.PivotCategory, out var pivotCategoryRow); if (pivotCategoryRow != null) { - pivotCategoryRow.AllColumnStates.TryGetValue(columnState.DataLoadRunID, out var allcolumnStates); + //they all seem to be dataLoadId [0], but should check this is true + //pivotCategoryRow.AllColumnStates.TryGetValue(columnState.DataLoadRunID, out var allcolumnStates); + pivotCategoryRow.AllColumnStates.TryGetValue(0, out var allcolumnStates); if (allcolumnStates is not null) { //var allcolumnStates = pivotCategoryRow.AllColumnStates[columnState.DataLoadRunID]; @@ -312,6 +304,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li { var y = pivotCategoryColumns; } + if (cs.CountCorrect < 1 && cs.CountMissing < 1 && cs.CountWrong < 1 && cs.CountInvalidatesRow < 1) continue; cs.Commit(evaluation, columnState.PivotCategory, con.Connection, con.Transaction); } @@ -321,6 +314,17 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li foreach (var category in categories) { var periodicityDT = 
PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + var matchingReplaced = replaced.AsEnumerable(); + if (category != "ALL") + { + matchingReplaced = matchingReplaced.Where(row => row[pivotColumn].ToString() == category); + } + //do we validate the row here... + //var x = _validator.ItemValidators; + //foreach(var validator in _validator.ItemValidators) + //{ + // validator.ValidateAll() + //} if (!newByPivotCategoryCubesOverTime.ContainsKey(category)) { newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); @@ -328,9 +332,26 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li //exists, just add foreach (var row in periodicityDT.AsEnumerable()) { - for (int i = 0; i < int.Parse(row.ItemArray[2].ToString()); i++) + Enum.TryParse(row.ItemArray[3].ToString(), out Consequence cons); + var count = int.Parse(row.ItemArray[2].ToString()); + foreach(var replacedRow in matchingReplaced) + { + foreach(var validator in _validator.ItemValidators.Where(iv => iv.PrimaryConstraint.Consequence == cons)) //only use validators with matching consequences + { + var cols = replaced.Columns.Cast().Where(c => c.ColumnName != validator.TargetProperty); + var result = validator.ValidateAll(replacedRow[validator.TargetProperty], cols.ToArray(), cols.Select(c => c.ColumnName).ToArray()); + if (result != null) + { + count -= 1; + } + var res = validator;//.ValidateAll(row[validator.PrimaryConstraint.col]) + + } + } + + for (int i = 0; i < count; i++) { - Enum.TryParse(row.ItemArray[3].ToString(), out Consequence cons); + //TODO this is where the periodicitystates are generated from newByPivotCategoryCubesOverTime[category].IncrementHyperCube(DateTime.Parse(row.ItemArray[1].ToString()).Year, DateTime.Parse(row.ItemArray[1].ToString()).Month, cons); } } From 14f5163c6375bf3bb211ca7d9d16dbd4dde0e5fb Mon Sep 17 00:00:00 2001 From: James Friel Date: Fri, 15 Nov 2024 11:45:37 +0000 Subject: 
[PATCH 13/35] promising --- .../Triggers/DiffDatabaseDataFetcher.cs | 11 +- .../Reports/CatalogueConstraintReport.cs | 359 +++++++++++++----- 2 files changed, 269 insertions(+), 101 deletions(-) diff --git a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs index 01adfa4272..f16b7672ed 100644 --- a/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs +++ b/Rdmp.Core/DataLoad/Triggers/DiffDatabaseDataFetcher.cs @@ -10,6 +10,7 @@ using System.Text; using FAnsi; using FAnsi.Discovery; +using MongoDB.Driver; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Spontaneous; using Rdmp.Core.QueryBuilding; @@ -256,11 +257,15 @@ SELECT TOP 1 {{2}}.* //add the columns from the combo table to both views foreach (DataColumn col in dtComboTable.Columns) + { if (!col.ColumnName.StartsWith(zzArchive, StringComparison.InvariantCultureIgnoreCase)) { Updates_New.Columns.Add(col.ColumnName, col.DataType); Updates_Replaced.Columns.Add(col.ColumnName, col.DataType); } + } + Updates_Replaced.Columns.Add(SpecialFieldNames.DataLoadRunID, typeof(int)); + Updates_Replaced.Columns.Add(SpecialFieldNames.ValidFrom, typeof(DateTime)); foreach (DataRow fromRow in dtComboTable.Rows) { @@ -275,10 +280,10 @@ SELECT TOP 1 {{2}}.* } } - private string GetHICSpecialColumns(string tableName, string columnAliasString) + private string GetHICSpecialColumns(string tableName, string columnAliasPrefix = "") { - return $@"{tableName}.{SpecialFieldNames.DataLoadRunID} as {SpecialFieldNames.DataLoadRunID}, -{tableName}.{SpecialFieldNames.ValidFrom} as {SpecialFieldNames.ValidFrom} + return $@"{tableName}.{SpecialFieldNames.DataLoadRunID} as {columnAliasPrefix}{SpecialFieldNames.DataLoadRunID}, +{tableName}.{SpecialFieldNames.ValidFrom} as {columnAliasPrefix}{SpecialFieldNames.ValidFrom} "; } diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs 
index 9a9faded3b..95373655eb 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -10,9 +10,13 @@ using System.Data.Common; using System.Diagnostics; using System.Linq; +using System.Reflection; +using System.Runtime.InteropServices; using System.Threading; using FAnsi.Discovery; using MongoDB.Driver; +using NPOI.OpenXmlFormats.Vml; +using NPOI.Util; using Org.BouncyCastle.Security.Certificates; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Defaults; @@ -221,147 +225,306 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li { try { - //mark down that we are beginning an evaluation on this the day of our lord etc... var previousEvaluation = dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); var evaluation = new Evaluation(dqeRepository, _catalogue); - - //find entries that have been put in the archive - //is this the correct table info? + //what about periodicityValues? 
var tableInfo = _catalogue.CatalogueItems.First().ColumnInfo.TableInfo; - var dataDiffFetcher = new DiffDatabaseDataFetcher(9, tableInfo, (int)_dataLoadID, 50000); + var dataDiffFetcher = new DiffDatabaseDataFetcher(10000000, tableInfo, (int)_dataLoadID, 50000);//todo update these numbers dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); - var replaced = dataDiffFetcher.Updates_Replaced; //all the stuff that has been replaced by the new data load - - var previousReportBuilder = new ReportBuilder(_catalogue, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, replaced); - previousReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); - var previousRows = previousReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); - var previousColumns = previousReportBuilder.GetByPivotCategoryCubesOverTime(); - + var replaced = dataDiffFetcher.Updates_Replaced; var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); - - foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) - { - state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); - } - + var timeColumn = c.TimeCoverage_ExtractionInformation.ColumnInfo.GetRuntimeName(); foreach (var rowState in previousEvaluation.RowStates) { var correct = rowState.Correct; var missing = rowState.Missing; var wrong = rowState.Wrong; var invalid = rowState.Invalid; - if (replaced.AsEnumerable().Any() && previousRows.TryGetValue(rowState.PivotCategory, out var pivotCategoryRow)) + var matchingReplacements = replaced.AsEnumerable().Where(row => int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID); + if (rowState.PivotCategory != "ALL") { - var oldCorrect = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[0]; - var oldMissing = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Missing]; - var oldWrong = 
pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Wrong]; - var oldInvalid = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.InvalidatesRow]; - correct -= oldCorrect; - missing -= oldMissing; - wrong -= oldWrong; - invalid -= oldInvalid; - Console.WriteLine("1"); + matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == rowState.PivotCategory); + } + foreach (var replacement in matchingReplacements) + { + foreach (var itemValidator in _validator.ItemValidators) + { + var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + if (result is not null) + { + if (result.SourceConstraint.Consequence == Consequence.Missing) + { + missing -= 1; + } + else if (result.SourceConstraint.Consequence == Consequence.Wrong) + { + wrong -= 1; + } + else if (result.SourceConstraint.Consequence == Consequence.InvalidatesRow) + { + invalid -= 1; + } + } + else + { + correct -= 1; + } + } } - if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; - - _ = new RowState(evaluation, rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); + evaluation.AddRowState(rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } foreach (var columnState in previousEvaluation.ColumnStates) { - var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) + var dbNull = columnState.CountDBNull; + var missing = columnState.CountMissing; + var wrong = columnState.CountWrong; + var invalid = columnState.CountInvalidatesRow; + var correct = columnState.CountCorrect; + var matchingReplacements = replaced.AsEnumerable().Where(row => 
int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == columnState.DataLoadRunID); + if (columnState.PivotCategory != "ALL") { - CountMissing = columnState.CountMissing, - CountWrong = columnState.CountWrong, - CountInvalidatesRow = columnState.CountInvalidatesRow, - CountCorrect = columnState.CountCorrect, - CountDBNull = columnState.CountDBNull - }; - var x = previousRows.TryGetValue(columnState.PivotCategory, out var pivotCategoryRow); - if (pivotCategoryRow != null) + matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == columnState.PivotCategory); + } + foreach (var replacement in matchingReplacements) { - //they all seem to be dataLoadId [0], but should check this is true - //pivotCategoryRow.AllColumnStates.TryGetValue(columnState.DataLoadRunID, out var allcolumnStates); - pivotCategoryRow.AllColumnStates.TryGetValue(0, out var allcolumnStates); - if (allcolumnStates is not null) + var itemValidators = _validator.ItemValidators.Where(iv => iv.TargetProperty == columnState.TargetProperty); + if (itemValidators.Any()) { - //var allcolumnStates = pivotCategoryRow.AllColumnStates[columnState.DataLoadRunID]; - var col = allcolumnStates.Where(c => c.TargetProperty == columnState.TargetProperty).FirstOrDefault(); - if (col is not null) + foreach (var itemValidator in itemValidators) { - //cs.CountMissing -= col.CountMissing; - cs.CountMissing = col.CountMissing - cs.CountMissing; - cs.CountWrong -= col.CountWrong; - cs.CountInvalidatesRow -= col.CountInvalidatesRow; - cs.CountCorrect -= col.CountCorrect; - cs.CountDBNull -= col.CountDBNull; + var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + if (result.SourceConstraint.Consequence == Consequence.Missing) + { + missing -= 1; + } + else if (result.SourceConstraint.Consequence == 
Consequence.Wrong) + { + wrong -= 1; + } + else if (result.SourceConstraint.Consequence == Consequence.InvalidatesRow) + { + invalid -= 1; + } } } + else + { + correct -= 1; //remove a correct entry + } } + if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; - - if (replaced.AsEnumerable().Any() && previousColumns.TryGetValue(columnState.PivotCategory, out var pivotCategoryColumns)) + var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) { - var y = pivotCategoryColumns; - } - if (cs.CountCorrect < 1 && cs.CountMissing < 1 && cs.CountWrong < 1 && cs.CountInvalidatesRow < 1) continue; + CountMissing = missing, + CountWrong = wrong, + CountInvalidatesRow = invalid, + CountCorrect = correct, + CountDBNull = dbNull + }; cs.Commit(evaluation, columnState.PivotCategory, con.Connection, con.Transaction); } - if (_timePeriodicityField != null) + var categories = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).ToList().Distinct(); + foreach (var category in categories) { - var categories = previousEvaluation.RowStates.Select(r => r.PivotCategory).Distinct().ToList(); - foreach (var category in categories) + //this is working well, but it's not decresing the periodicityState when the value is replaced + var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + var periodicityCube = new PeriodicityCubesOverTime(category); + newByPivotCategoryCubesOverTime.TryGetValue(category, out PeriodicityCubesOverTime value); + if (value is not null) + { + periodicityCube = value; + } + foreach (var row in previousPeriodicity.AsEnumerable()) { - var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - var matchingReplaced = replaced.AsEnumerable(); + var year = DateTime.Parse(row["YearMonth"].ToString()).Year; + var month = DateTime.Parse(row["YearMonth"].ToString()).Month; + _ = 
Enum.TryParse(row["RowEvaluation"].ToString(), out Consequence cons); + + var count = int.Parse(row["CountOfRecords"].ToString()); + var matchingReplacements = replaced.AsEnumerable(); if (category != "ALL") { - matchingReplaced = matchingReplaced.Where(row => row[pivotColumn].ToString() == category); - } - //do we validate the row here... - //var x = _validator.ItemValidators; - //foreach(var validator in _validator.ItemValidators) - //{ - // validator.ValidateAll() - //} - if (!newByPivotCategoryCubesOverTime.ContainsKey(category)) - { - newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == category && DateTime.Parse(row[timeColumn].ToString()).Year == year && DateTime.Parse(row[timeColumn].ToString()).Month == month); } - //exists, just add - foreach (var row in periodicityDT.AsEnumerable()) + + foreach (var replacement in matchingReplacements) { - Enum.TryParse(row.ItemArray[3].ToString(), out Consequence cons); - var count = int.Parse(row.ItemArray[2].ToString()); - foreach(var replacedRow in matchingReplaced) + var itemValidators = _validator.ItemValidators; + foreach (var itemValidator in itemValidators) { - foreach(var validator in _validator.ItemValidators.Where(iv => iv.PrimaryConstraint.Consequence == cons)) //only use validators with matching consequences + var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + if(result.SourceConstraint.Consequence == cons) { - var cols = replaced.Columns.Cast().Where(c => c.ColumnName != validator.TargetProperty); - var result = validator.ValidateAll(replacedRow[validator.TargetProperty], cols.ToArray(), cols.Select(c => c.ColumnName).ToArray()); - if (result != null) - { - count -= 1; - } - var res = 
validator;//.ValidateAll(row[validator.PrimaryConstraint.col]) - + count -= 1; } } - - for (int i = 0; i < count; i++) - { - //TODO this is where the periodicitystates are generated from - newByPivotCategoryCubesOverTime[category].IncrementHyperCube(DateTime.Parse(row.ItemArray[1].ToString()).Year, DateTime.Parse(row.ItemArray[1].ToString()).Month, cons); - } } + + for (var i = 0; i < count; i++) + { + periodicityCube.IncrementHyperCube(year, month, cons); + } } - foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) - { - periodicity.CommitToDatabase(evaluation); - } + periodicityCube.CommitToDatabase(evaluation); + } + + foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) + { + state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); } + //foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) + //{ + // periodicity.CommitToDatabase(evaluation); + //} + + ////mark down that we are beginning an evaluation on this the day of our lord etc... + //var previousEvaluation = dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); + //var evaluation = new Evaluation(dqeRepository, _catalogue); + + ////find entries that have been put in the archive + ////is this the correct table info? 
+ //var tableInfo = _catalogue.CatalogueItems.First().ColumnInfo.TableInfo; + //var dataDiffFetcher = new DiffDatabaseDataFetcher(9, tableInfo, (int)_dataLoadID, 50000); + //dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); + //var replaced = dataDiffFetcher.Updates_Replaced; //all the stuff that has been replaced by the new data load + + //var previousReportBuilder = new ReportBuilder(_catalogue, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, replaced); + //previousReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + //var previousRows = previousReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); + //var previousColumns = previousReportBuilder.GetByPivotCategoryCubesOverTime(); + + //var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); + + //foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) + //{ + // state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); + //} + + //foreach (var rowState in previousEvaluation.RowStates) + //{ + // var correct = rowState.Correct; + // var missing = rowState.Missing; + // var wrong = rowState.Wrong; + // var invalid = rowState.Invalid; + // if (replaced.AsEnumerable().Any() && previousRows.TryGetValue(rowState.PivotCategory, out var pivotCategoryRow)) + // { + // var oldCorrect = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[0]; + // var oldMissing = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Missing]; + // var oldWrong = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Wrong]; + // var oldInvalid = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.InvalidatesRow]; + // correct -= oldCorrect; + // missing -= oldMissing; + // wrong -= oldWrong; + // invalid -= oldInvalid; + // Console.WriteLine("1"); + // } + + // if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; + + // _ = new 
RowState(evaluation, rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); + //} + //foreach (var columnState in previousEvaluation.ColumnStates) + //{ + // var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) + // { + // CountMissing = columnState.CountMissing, + // CountWrong = columnState.CountWrong, + // CountInvalidatesRow = columnState.CountInvalidatesRow, + // CountCorrect = columnState.CountCorrect, + // CountDBNull = columnState.CountDBNull + // }; + // var x = previousRows.TryGetValue(columnState.PivotCategory, out var pivotCategoryRow); + // if (pivotCategoryRow != null) + // { + // //they all seem to be dataLoadId [0], but should check this is true + // //pivotCategoryRow.AllColumnStates.TryGetValue(columnState.DataLoadRunID, out var allcolumnStates); + // pivotCategoryRow.AllColumnStates.TryGetValue(0, out var allcolumnStates); + // if (allcolumnStates is not null) + // { + // //var allcolumnStates = pivotCategoryRow.AllColumnStates[columnState.DataLoadRunID]; + // var col = allcolumnStates.Where(c => c.TargetProperty == columnState.TargetProperty).FirstOrDefault(); + // if (col is not null) + // { + // //cs.CountMissing -= col.CountMissing; + // cs.CountMissing = col.CountMissing - cs.CountMissing; + // cs.CountWrong -= col.CountWrong; + // cs.CountInvalidatesRow -= col.CountInvalidatesRow; + // cs.CountCorrect -= col.CountCorrect; + // cs.CountDBNull -= col.CountDBNull; + // } + // } + // } + + + // if (replaced.AsEnumerable().Any() && previousColumns.TryGetValue(columnState.PivotCategory, out var pivotCategoryColumns)) + // { + // var y = pivotCategoryColumns; + // } + // if (cs.CountCorrect < 1 && cs.CountMissing < 1 && cs.CountWrong < 1 && cs.CountInvalidatesRow < 1) continue; + // cs.Commit(evaluation, columnState.PivotCategory, con.Connection, con.Transaction); + //} + + //if (_timePeriodicityField != 
null) + //{ + // var categories = previousEvaluation.RowStates.Select(r => r.PivotCategory).Distinct().ToList(); + // foreach (var category in categories) + // { + // var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + // var matchingReplaced = replaced.AsEnumerable(); + // if (category != "ALL") + // { + // matchingReplaced = matchingReplaced.Where(row => row[pivotColumn].ToString() == category); + // } + // //do we validate the row here... + // //var x = _validator.ItemValidators; + // //foreach(var validator in _validator.ItemValidators) + // //{ + // // validator.ValidateAll() + // //} + // if (!newByPivotCategoryCubesOverTime.ContainsKey(category)) + // { + // newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + // } + // //exists, just add + // foreach (var row in periodicityDT.AsEnumerable()) + // { + // Enum.TryParse(row.ItemArray[3].ToString(), out Consequence cons); + // var count = int.Parse(row.ItemArray[2].ToString()); + // foreach(var replacedRow in matchingReplaced) + // { + // foreach(var validator in _validator.ItemValidators.Where(iv => iv.PrimaryConstraint.Consequence == cons)) //only use validators with matching consequences + // { + // var cols = replaced.Columns.Cast().Where(c => c.ColumnName != validator.TargetProperty); + // var result = validator.ValidateAll(replacedRow[validator.TargetProperty], cols.ToArray(), cols.Select(c => c.ColumnName).ToArray()); + // if (result != null) + // { + // count -= 1; + // } + // var res = validator;//.ValidateAll(row[validator.PrimaryConstraint.col]) + + // } + // } + + // for (int i = 0; i < count; i++) + // { + // //TODO this is where the periodicitystates are generated from + // newByPivotCategoryCubesOverTime[category].IncrementHyperCube(DateTime.Parse(row.ItemArray[1].ToString()).Year, DateTime.Parse(row.ItemArray[1].ToString()).Month, cons); + // } + // } + + // } + // foreach (var periodicity in 
newByPivotCategoryCubesOverTime.Values) + // { + // periodicity.CommitToDatabase(evaluation); + // } + //} dqeRepository.EndTransactedConnection(true); } catch (Exception) From 7115f441d0f2cc7a9891d6636c5a698118f98020 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 18 Nov 2024 09:18:23 +0000 Subject: [PATCH 14/35] interim --- .../WindowManagement/ActivateItems.cs | 2 +- .../DataQualityEngine/Reports/CatalogueConstraintReport.cs | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs b/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs index 9b3609bbe8..11fcb75d81 100644 --- a/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs +++ b/Application/ResearchDataManagementPlatform/WindowManagement/ActivateItems.cs @@ -386,7 +386,7 @@ private T Activate(T2 databaseObject, Image tabImage) uiInstance.SetDatabaseObject(this, databaseObject); - if (insertIndex is not null) + if (insertIndex is not null && _mainDockPanel.ActivePane is not null) { _mainDockPanel.ActivePane.SetContentIndex(floatable, (int)insertIndex); } diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 95373655eb..303aeb5119 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -227,7 +227,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li { var previousEvaluation = dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); var evaluation = new Evaluation(dqeRepository, _catalogue); - //what about periodicityValues? 
var tableInfo = _catalogue.CatalogueItems.First().ColumnInfo.TableInfo; var dataDiffFetcher = new DiffDatabaseDataFetcher(10000000, tableInfo, (int)_dataLoadID, 50000);//todo update these numbers dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); @@ -275,6 +274,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; evaluation.AddRowState(rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } + + //column state is wrong foreach (var columnState in previousEvaluation.ColumnStates) { var dbNull = columnState.CountDBNull; @@ -328,6 +329,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li cs.Commit(evaluation, columnState.PivotCategory, con.Connection, con.Transaction); } + + //pivot category is wrong var categories = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).ToList().Distinct(); foreach (var category in categories) { From 9cc351b2e058827af15e9dd2fcb2d3a9e333c913 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 18 Nov 2024 14:59:50 +0000 Subject: [PATCH 15/35] attempt works --- .../Reports/CatalogueConstraintReport.cs | 198 ++++++++++++++---- 1 file changed, 155 insertions(+), 43 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 303aeb5119..38a9eb200c 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -16,6 +16,7 @@ using FAnsi.Discovery; using MongoDB.Driver; using NPOI.OpenXmlFormats.Vml; +using NPOI.SS.Formula.Functions; using NPOI.Util; using Org.BouncyCastle.Security.Certificates; using Rdmp.Core.Curation.Data; @@ -274,8 +275,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener 
li if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; evaluation.AddRowState(rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); } - - //column state is wrong + foreach (var columnState in previousEvaluation.ColumnStates) { var dbNull = columnState.CountDBNull; @@ -316,6 +316,68 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li correct -= 1; //remove a correct entry } } + //if(columnState.PivotCategory == "ALL") + //{ + // var inserts = dataDiffFetcher.Inserts; + // foreach (var insert in inserts.AsEnumerable()) + // { + // var itemValidators = _validator.ItemValidators.Where(iv => iv.TargetProperty == columnState.TargetProperty); + // if (itemValidators.Any()) + // { + // foreach (var itemValidator in itemValidators) + // { + // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + // var result = itemValidator.ValidateAll(insert[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + // if (result.SourceConstraint.Consequence == Consequence.Missing) + // { + // missing += 1; + // } + // else if (result.SourceConstraint.Consequence == Consequence.Wrong) + // { + // wrong += 1; + // } + // else if (result.SourceConstraint.Consequence == Consequence.InvalidatesRow) + // { + // invalid += 1; + // } + // } + // } + // else + // { + // correct += 1; //remove a correct entry + // } + // } + // var newUpdates = dataDiffFetcher.Updates_New; + // //foreach (var newUpdate in newUpdates.AsEnumerable()) + // //{ + // // var itemValidators = _validator.ItemValidators.Where(iv => iv.TargetProperty == columnState.TargetProperty); + // // if (itemValidators.Any()) + // // { + // // foreach (var itemValidator in itemValidators) + // // { + // // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + // // 
var result = itemValidator.ValidateAll(newUpdate[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + // // if (result.SourceConstraint.Consequence == Consequence.Missing) + // // { + // // missing += 1; + // // } + // // else if (result.SourceConstraint.Consequence == Consequence.Wrong) + // // { + // // wrong += 1; + // // } + // // else if (result.SourceConstraint.Consequence == Consequence.InvalidatesRow) + // // { + // // invalid += 1; + // // } + // // } + // // } + // // else + // // { + // // //correct += 1; //remove a correct entry + // // } + // //} + //} + ////probably want to use the inserts & updates new to add to the columnstate for ALL if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) @@ -330,58 +392,108 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } - //pivot category is wrong - var categories = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).ToList().Distinct(); - foreach (var category in categories) + + //foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) + // state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); + var currentEvalCategories = newByPivotCategoryCubesOverTime.Keys.ToList(); + var categoriesThatHaveGoneMissing = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).Where(pc => !currentEvalCategories.Contains(pc)).ToList().Distinct(); + foreach (var category in categoriesThatHaveGoneMissing) { - //this is working well, but it's not decresing the periodicityState when the value is replaced - var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - var periodicityCube = new PeriodicityCubesOverTime(category); - newByPivotCategoryCubesOverTime.TryGetValue(category, out PeriodicityCubesOverTime value); - if (value is 
not null) - { - periodicityCube = value; - } - foreach (var row in previousPeriodicity.AsEnumerable()) + var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + foreach (var row in periodicityDT.AsEnumerable()) { var year = DateTime.Parse(row["YearMonth"].ToString()).Year; var month = DateTime.Parse(row["YearMonth"].ToString()).Month; - _ = Enum.TryParse(row["RowEvaluation"].ToString(), out Consequence cons); + var worseConsequence = row["RowEvaluation"].ToString(); + _ = Enum.TryParse(worseConsequence, out Consequence cons); - var count = int.Parse(row["CountOfRecords"].ToString()); - var matchingReplacements = replaced.AsEnumerable(); - if (category != "ALL") + var count = 0; + while (count < int.Parse(row["CountOfRecords"].ToString())) { - matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == category && DateTime.Parse(row[timeColumn].ToString()).Year == year && DateTime.Parse(row[timeColumn].ToString()).Month == month); + newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, worseConsequence == "Correct" ? null : cons); + newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(year, month, worseConsequence == "Correct" ? 
null : cons); + count++; } + } - foreach (var replacement in matchingReplacements) - { - var itemValidators = _validator.ItemValidators; - foreach (var itemValidator in itemValidators) - { - var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - if(result.SourceConstraint.Consequence == cons) - { - count -= 1; - } - } - } + } + if (_timePeriodicityField != null) + foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) + periodicity.CommitToDatabase(evaluation); + //pivot category is wrong + //var categories = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).ToList().Distinct(); + //foreach (var category in categories) + //{ + // //this is working well, but it's not decresing the periodicityState when the value is replaced + // var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + // var periodicityCube = new PeriodicityCubesOverTime(category); + // newByPivotCategoryCubesOverTime.TryGetValue(category, out PeriodicityCubesOverTime value); + // if (value is not null) + // { + // periodicityCube = value; + // } + // foreach (var row in previousPeriodicity.AsEnumerable()) + // { + // var year = DateTime.Parse(row["YearMonth"].ToString()).Year; + // var month = DateTime.Parse(row["YearMonth"].ToString()).Month; + // _ = Enum.TryParse(row["RowEvaluation"].ToString(), out Consequence cons); - for (var i = 0; i < count; i++) - { - periodicityCube.IncrementHyperCube(year, month, cons); - } - } - periodicityCube.CommitToDatabase(evaluation); - } + // var count = int.Parse(row["CountOfRecords"].ToString()); + // var matchingReplacements = replaced.AsEnumerable(); + // if (category != "ALL") + // { + // matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == 
category && DateTime.Parse(row[timeColumn].ToString()).Year == year && DateTime.Parse(row[timeColumn].ToString()).Month == month); + // } - foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) - { - state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); - } + // foreach (var replacement in matchingReplacements) + // { + // var itemValidators = _validator.ItemValidators; + // foreach (var itemValidator in itemValidators) + // { + // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + // var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + // if (result is not null && result.SourceConstraint.Consequence == cons) + // { + // count -= 1; + // break; + // } + // } + // } + + // for (var i = 0; i < count; i++) + // { + // periodicityCube.IncrementHyperCube(year, month, cons); + // } + // } + + // foreach (var update in dataDiffFetcher.Updates_New.AsEnumerable()) + // { + // var itemValidators = _validator.ItemValidators; + // var year = DateTime.Parse(update[timeColumn].ToString()).Year; + // var month = DateTime.Parse(update[timeColumn].ToString()).Month; + // foreach (var itemValidator in itemValidators) + // { + // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + // var result = itemValidator.ValidateAll(update[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + // //if (result is not null && result.SourceConstraint.Consequence == cons) + // //{ + // // count += 1; + // //} + // if (result is not null) + // { + // periodicityCube.IncrementHyperCube(year, month, result.SourceConstraint.Consequence); + // } + // } + // } + // periodicityCube.CommitToDatabase(evaluation); + //} + + //foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) + //{ + //state.CommitToDatabase(evaluation, 
_catalogue, con.Connection, con.Transaction); + //} //foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) //{ // periodicity.CommitToDatabase(evaluation); From 3ef391fdff5deab3198e2e4f2d10be3e59b017c1 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 18 Nov 2024 15:57:43 +0000 Subject: [PATCH 16/35] fix row state --- .../DataQualityEngine/Reports/CatalogueConstraintReport.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 38a9eb200c..11337b7a29 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -392,9 +392,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } - - //foreach (var state in byPivotRowStatesOverDataLoadRunId.Values) - // state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); var currentEvalCategories = newByPivotCategoryCubesOverTime.Keys.ToList(); var categoriesThatHaveGoneMissing = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).Where(pc => !currentEvalCategories.Contains(pc)).ToList().Distinct(); foreach (var category in categoriesThatHaveGoneMissing) @@ -418,6 +415,9 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } } + foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) + state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); + if (_timePeriodicityField != null) foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) periodicity.CommitToDatabase(evaluation); From 558e89c86d1e209f5d0f1111d172d7d22fd482b2 Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 19 Nov 2024 07:42:18 +0000 Subject: [PATCH 17/35] tidy up --- .../Reports/CatalogueConstraintReport.cs | 293 +----------------- 1 file changed, 12 
insertions(+), 281 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 11337b7a29..62a464de58 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -297,7 +297,11 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li { var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - if (result.SourceConstraint.Consequence == Consequence.Missing) + if(result is null) + { + correct -= 1; + } + else if (result.SourceConstraint.Consequence == Consequence.Missing) { missing -= 1; } @@ -316,68 +320,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li correct -= 1; //remove a correct entry } } - //if(columnState.PivotCategory == "ALL") - //{ - // var inserts = dataDiffFetcher.Inserts; - // foreach (var insert in inserts.AsEnumerable()) - // { - // var itemValidators = _validator.ItemValidators.Where(iv => iv.TargetProperty == columnState.TargetProperty); - // if (itemValidators.Any()) - // { - // foreach (var itemValidator in itemValidators) - // { - // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - // var result = itemValidator.ValidateAll(insert[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - // if (result.SourceConstraint.Consequence == Consequence.Missing) - // { - // missing += 1; - // } - // else if (result.SourceConstraint.Consequence == Consequence.Wrong) - // { - // wrong += 1; - // } - // else if (result.SourceConstraint.Consequence == Consequence.InvalidatesRow) - // { - // invalid += 1; - // } - // } - // } - // else 
- // { - // correct += 1; //remove a correct entry - // } - // } - // var newUpdates = dataDiffFetcher.Updates_New; - // //foreach (var newUpdate in newUpdates.AsEnumerable()) - // //{ - // // var itemValidators = _validator.ItemValidators.Where(iv => iv.TargetProperty == columnState.TargetProperty); - // // if (itemValidators.Any()) - // // { - // // foreach (var itemValidator in itemValidators) - // // { - // // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - // // var result = itemValidator.ValidateAll(newUpdate[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - // // if (result.SourceConstraint.Consequence == Consequence.Missing) - // // { - // // missing += 1; - // // } - // // else if (result.SourceConstraint.Consequence == Consequence.Wrong) - // // { - // // wrong += 1; - // // } - // // else if (result.SourceConstraint.Consequence == Consequence.InvalidatesRow) - // // { - // // invalid += 1; - // // } - // // } - // // } - // // else - // // { - // // //correct += 1; //remove a correct entry - // // } - // //} - //} - ////probably want to use the inserts & updates new to add to the columnstate for ALL if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) @@ -394,8 +336,15 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var currentEvalCategories = newByPivotCategoryCubesOverTime.Keys.ToList(); var categoriesThatHaveGoneMissing = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).Where(pc => !currentEvalCategories.Contains(pc)).ToList().Distinct(); + //make sure they weren't in replaced... 
foreach (var category in categoriesThatHaveGoneMissing) { + var prevousCount = previousEvaluation.RowStates.Where(rs => rs.PivotCategory == category).Count(); + var replacedCount = replaced.AsEnumerable().Where(r => r[pivotColumn].ToString() == category).Count(); + if (prevousCount == replacedCount) + { + continue; + } var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); foreach (var row in periodicityDT.AsEnumerable()) @@ -422,224 +371,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) periodicity.CommitToDatabase(evaluation); - //pivot category is wrong - //var categories = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).ToList().Distinct(); - //foreach (var category in categories) - //{ - // //this is working well, but it's not decresing the periodicityState when the value is replaced - // var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - // var periodicityCube = new PeriodicityCubesOverTime(category); - // newByPivotCategoryCubesOverTime.TryGetValue(category, out PeriodicityCubesOverTime value); - // if (value is not null) - // { - // periodicityCube = value; - // } - // foreach (var row in previousPeriodicity.AsEnumerable()) - // { - // var year = DateTime.Parse(row["YearMonth"].ToString()).Year; - // var month = DateTime.Parse(row["YearMonth"].ToString()).Month; - // _ = Enum.TryParse(row["RowEvaluation"].ToString(), out Consequence cons); - - // var count = int.Parse(row["CountOfRecords"].ToString()); - // var matchingReplacements = replaced.AsEnumerable(); - // if (category != "ALL") - // { - // matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == category && 
DateTime.Parse(row[timeColumn].ToString()).Year == year && DateTime.Parse(row[timeColumn].ToString()).Month == month); - // } - - // foreach (var replacement in matchingReplacements) - // { - // var itemValidators = _validator.ItemValidators; - // foreach (var itemValidator in itemValidators) - // { - // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - // var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - // if (result is not null && result.SourceConstraint.Consequence == cons) - // { - // count -= 1; - // break; - // } - // } - // } - - // for (var i = 0; i < count; i++) - // { - // periodicityCube.IncrementHyperCube(year, month, cons); - // } - // } - - // foreach (var update in dataDiffFetcher.Updates_New.AsEnumerable()) - // { - // var itemValidators = _validator.ItemValidators; - // var year = DateTime.Parse(update[timeColumn].ToString()).Year; - // var month = DateTime.Parse(update[timeColumn].ToString()).Month; - // foreach (var itemValidator in itemValidators) - // { - // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - // var result = itemValidator.ValidateAll(update[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - // //if (result is not null && result.SourceConstraint.Consequence == cons) - // //{ - // // count += 1; - // //} - // if (result is not null) - // { - // periodicityCube.IncrementHyperCube(year, month, result.SourceConstraint.Consequence); - // } - // } - // } - // periodicityCube.CommitToDatabase(evaluation); - //} - - //foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) - //{ - //state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); - //} - //foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) - //{ - // 
periodicity.CommitToDatabase(evaluation); - //} - - ////mark down that we are beginning an evaluation on this the day of our lord etc... - //var previousEvaluation = dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); - //var evaluation = new Evaluation(dqeRepository, _catalogue); - - ////find entries that have been put in the archive - ////is this the correct table info? - //var tableInfo = _catalogue.CatalogueItems.First().ColumnInfo.TableInfo; - //var dataDiffFetcher = new DiffDatabaseDataFetcher(9, tableInfo, (int)_dataLoadID, 50000); - //dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); - //var replaced = dataDiffFetcher.Updates_Replaced; //all the stuff that has been replaced by the new data load - - //var previousReportBuilder = new ReportBuilder(_catalogue, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, replaced); - //previousReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); - //var previousRows = previousReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); - //var previousColumns = previousReportBuilder.GetByPivotCategoryCubesOverTime(); - - //var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); - - //foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) - //{ - // state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); - //} - - //foreach (var rowState in previousEvaluation.RowStates) - //{ - // var correct = rowState.Correct; - // var missing = rowState.Missing; - // var wrong = rowState.Wrong; - // var invalid = rowState.Invalid; - // if (replaced.AsEnumerable().Any() && previousRows.TryGetValue(rowState.PivotCategory, out var pivotCategoryRow)) - // { - // var oldCorrect = pivotCategoryRow.RowsPassingValidationByDataLoadRunID[0]; - // var oldMissing = 
pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Missing]; - // var oldWrong = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.Wrong]; - // var oldInvalid = pivotCategoryRow.WorstConsequencesByDataLoadRunID[0][Consequence.InvalidatesRow]; - // correct -= oldCorrect; - // missing -= oldMissing; - // wrong -= oldWrong; - // invalid -= oldInvalid; - // Console.WriteLine("1"); - // } - - // if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; - - // _ = new RowState(evaluation, rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); - //} - //foreach (var columnState in previousEvaluation.ColumnStates) - //{ - // var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) - // { - // CountMissing = columnState.CountMissing, - // CountWrong = columnState.CountWrong, - // CountInvalidatesRow = columnState.CountInvalidatesRow, - // CountCorrect = columnState.CountCorrect, - // CountDBNull = columnState.CountDBNull - // }; - // var x = previousRows.TryGetValue(columnState.PivotCategory, out var pivotCategoryRow); - // if (pivotCategoryRow != null) - // { - // //they all seem to be dataLoadId [0], but should check this is true - // //pivotCategoryRow.AllColumnStates.TryGetValue(columnState.DataLoadRunID, out var allcolumnStates); - // pivotCategoryRow.AllColumnStates.TryGetValue(0, out var allcolumnStates); - // if (allcolumnStates is not null) - // { - // //var allcolumnStates = pivotCategoryRow.AllColumnStates[columnState.DataLoadRunID]; - // var col = allcolumnStates.Where(c => c.TargetProperty == columnState.TargetProperty).FirstOrDefault(); - // if (col is not null) - // { - // //cs.CountMissing -= col.CountMissing; - // cs.CountMissing = col.CountMissing - cs.CountMissing; - // cs.CountWrong -= col.CountWrong; - // cs.CountInvalidatesRow -= col.CountInvalidatesRow; - // cs.CountCorrect -= 
col.CountCorrect; - // cs.CountDBNull -= col.CountDBNull; - // } - // } - // } - - - // if (replaced.AsEnumerable().Any() && previousColumns.TryGetValue(columnState.PivotCategory, out var pivotCategoryColumns)) - // { - // var y = pivotCategoryColumns; - // } - // if (cs.CountCorrect < 1 && cs.CountMissing < 1 && cs.CountWrong < 1 && cs.CountInvalidatesRow < 1) continue; - // cs.Commit(evaluation, columnState.PivotCategory, con.Connection, con.Transaction); - //} - - //if (_timePeriodicityField != null) - //{ - // var categories = previousEvaluation.RowStates.Select(r => r.PivotCategory).Distinct().ToList(); - // foreach (var category in categories) - // { - // var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - // var matchingReplaced = replaced.AsEnumerable(); - // if (category != "ALL") - // { - // matchingReplaced = matchingReplaced.Where(row => row[pivotColumn].ToString() == category); - // } - // //do we validate the row here... 
- // //var x = _validator.ItemValidators; - // //foreach(var validator in _validator.ItemValidators) - // //{ - // // validator.ValidateAll() - // //} - // if (!newByPivotCategoryCubesOverTime.ContainsKey(category)) - // { - // newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); - // } - // //exists, just add - // foreach (var row in periodicityDT.AsEnumerable()) - // { - // Enum.TryParse(row.ItemArray[3].ToString(), out Consequence cons); - // var count = int.Parse(row.ItemArray[2].ToString()); - // foreach(var replacedRow in matchingReplaced) - // { - // foreach(var validator in _validator.ItemValidators.Where(iv => iv.PrimaryConstraint.Consequence == cons)) //only use validators with matching consequences - // { - // var cols = replaced.Columns.Cast().Where(c => c.ColumnName != validator.TargetProperty); - // var result = validator.ValidateAll(replacedRow[validator.TargetProperty], cols.ToArray(), cols.Select(c => c.ColumnName).ToArray()); - // if (result != null) - // { - // count -= 1; - // } - // var res = validator;//.ValidateAll(row[validator.PrimaryConstraint.col]) - - // } - // } - - // for (int i = 0; i < count; i++) - // { - // //TODO this is where the periodicitystates are generated from - // newByPivotCategoryCubesOverTime[category].IncrementHyperCube(DateTime.Parse(row.ItemArray[1].ToString()).Year, DateTime.Parse(row.ItemArray[1].ToString()).Month, cons); - // } - // } - - // } - // foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) - // { - // periodicity.CommitToDatabase(evaluation); - // } - //} dqeRepository.EndTransactedConnection(true); } catch (Exception) From 9c75308a0fb474aa431824085ebc30c8b4856fde Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 19 Nov 2024 10:57:41 +0000 Subject: [PATCH 18/35] rethink periodicity --- .../Reports/CatalogueConstraintReport.cs | 142 ++++++++++++++---- 1 file changed, 113 insertions(+), 29 deletions(-) diff --git 
a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 62a464de58..6076189dd2 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -297,7 +297,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li { var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - if(result is null) + if (result is null) { correct -= 1; } @@ -334,36 +334,120 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } - var currentEvalCategories = newByPivotCategoryCubesOverTime.Keys.ToList(); - var categoriesThatHaveGoneMissing = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).Where(pc => !currentEvalCategories.Contains(pc)).ToList().Distinct(); - //make sure they weren't in replaced... 
- foreach (var category in categoriesThatHaveGoneMissing) - { - var prevousCount = previousEvaluation.RowStates.Where(rs => rs.PivotCategory == category).Count(); - var replacedCount = replaced.AsEnumerable().Where(r => r[pivotColumn].ToString() == category).Count(); - if (prevousCount == replacedCount) - { - continue; - } - var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); - foreach (var row in periodicityDT.AsEnumerable()) - { - var year = DateTime.Parse(row["YearMonth"].ToString()).Year; - var month = DateTime.Parse(row["YearMonth"].ToString()).Month; - var worseConsequence = row["RowEvaluation"].ToString(); - _ = Enum.TryParse(worseConsequence, out Consequence cons); - var count = 0; - while (count < int.Parse(row["CountOfRecords"].ToString())) - { - newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, worseConsequence == "Correct" ? null : cons); - newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(year, month, worseConsequence == "Correct" ? null : cons); - count++; - } - } + ////periodicity is scuffed + + //var currentEvalCategories = newByPivotCategoryCubesOverTime.Keys.ToList(); + //var categoriesThatHaveGoneMissing = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).Where(pc => !currentEvalCategories.Contains(pc)).ToList().Distinct(); + ////make sure they weren't in replaced... 
+ //foreach (var category in categoriesThatHaveGoneMissing) + //{ + // var prevousCount = previousEvaluation.RowStates.Where(rs => rs.PivotCategory == category).Count(); + // var replacedCount = replaced.AsEnumerable().Where(r => r[pivotColumn].ToString() == category).Count(); + // if (prevousCount == replacedCount) + // { + // continue; + // } + // var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + // newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + // foreach (var row in periodicityDT.AsEnumerable()) + // { + // var year = DateTime.Parse(row["YearMonth"].ToString()).Year; + // var month = DateTime.Parse(row["YearMonth"].ToString()).Month; + // var worseConsequence = row["RowEvaluation"].ToString(); + // _ = Enum.TryParse(worseConsequence, out Consequence cons); + + // var count = 0; + // while (count < int.Parse(row["CountOfRecords"].ToString())) + // { + // newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, worseConsequence == "Correct" ? null : cons); + // newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(year, month, worseConsequence == "Correct" ? 
null : cons); + // count++; + // } + // } + //} + ////add values for rows that were in previous eval, but not replaced + //var previousCategories = previousEvaluation.GetPivotCategoryValues(); + //var missingCategories = previousCategories.Where(pc => !currentEvalCategories.Contains(pc) && !categoriesThatHaveGoneMissing.Contains(pc)); + //foreach (var category in missingCategories) + //{ + // var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + // if (periodicityDT == null) continue; + // foreach (var row in periodicityDT.AsEnumerable()) + // { + // var year = DateTime.Parse(row["YearMonth"].ToString()).Year; + // var month = DateTime.Parse(row["YearMonth"].ToString()).Month; + // var worseConsequence = row["RowEvaluation"].ToString(); + // if (worseConsequence == "Correct") worseConsequence = null; + // var matchingReplacements = replaced.AsEnumerable(); + + // if (category != "ALL") + // { + // matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == category); + // } + // var matchesReplacement = false; + // foreach (var replacementRow in matchingReplacements) + // { + // var replacementYear = DateTime.Parse(replacementRow[timeColumn].ToString()).Year; + // var replacementMonth = DateTime.Parse(replacementRow[timeColumn].ToString()).Month; + // string replacementRowEvaluation = null;//todo + // var columnResults = new List(); + // var itemValidators = _validator.ItemValidators.Where(iv => replaced.Columns.Cast().Select(c => c.ColumnName).Contains(iv.TargetProperty)); + // if (itemValidators.Any()) + // { + // foreach (var itemValidator in itemValidators) + // { + // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); + // var result = itemValidator.ValidateAll(replacementRow[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); + // if (result != null) + // { + // 
columnResults.Add(result.SourceConstraint.Consequence); + // } + // } + // } + // if (columnResults.Any()) + // { + // replacementRowEvaluation = columnResults.OrderByDescending(x => (int)(x)).ToList().First().ToString(); + // } + // if (year != replacementYear) + // { + // continue; + // } + // if (month != replacementMonth) + // { + // continue; + // } + // if (worseConsequence != replacementRowEvaluation) + // { + // continue; + // } + // matchesReplacement = true; + // break; + // } + // if (!matchesReplacement) + // { + // _ = Enum.TryParse(worseConsequence, out Consequence cons); + // if (int.Parse(row["CountOfRecords"].ToString()) > 0) + // { + // if (!newByPivotCategoryCubesOverTime.TryGetValue(category, out var exists)) + // { + // newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + // } + // newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, worseConsequence == "Correct" ? null : cons); + // newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(year, month, worseConsequence == "Correct" ? 
null : cons); + // } + // } + // } + //} + + //var persistingCategories = previousCategories.Where(pc => currentEvalCategories.Contains(pc)); + //foreach(var category in persistingCategories) + //{ + // var previousPeriodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); + // var currentPeriodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(evaluation, category, false); + // Console.WriteLine("X"); + //} - } foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); From 41dbf8acfda97d6be1ec082ea1fa050f9de8e880 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 16 Dec 2024 12:28:11 +0000 Subject: [PATCH 19/35] row state without all --- .../Reports/CatalogueConstraintReport.cs | 321 +++++------------- 1 file changed, 94 insertions(+), 227 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 6076189dd2..5ec811e5ce 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -222,251 +222,118 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); var newByPivotRowStatesOverDataLoadRunId = reportBuilder.GetByPivotRowStatesOverDataLoadRunId(); var newByPivotCategoryCubesOverTime = reportBuilder.GetByPivotCategoryCubesOverTime(); + + var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); + var timeColumn = c.TimeCoverage_ExtractionInformation.ColumnInfo.GetRuntimeName(); + + var incomingPivotCategories = rDT.AsEnumerable().Select(r => r[pivotColumn].ToString()).ToList(); + using (var con = dqeRepository.BeginNewTransactedConnection()) { - try + var previousEvaluation = 
dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? throw new Exception("No DQE results currently exist"); + var previousColumnStates = previousEvaluation.ColumnStates; + var previousRowSates = previousEvaluation.RowStates; + var previousCategories = previousEvaluation.GetPivotCategoryValues().Where(c => c != "ALL"); + + var evaluation = new Evaluation(dqeRepository, _catalogue); + + //new pivoutCategories coming in + var newIncomingPivotCategories = incomingPivotCategories.Where(c => !previousCategories.Contains(c)); + + + var pivotColumnInfo = _catalogue.CatalogueItems.Where(ci => ci.Name == _pivotCategory).FirstOrDefault(); + if (pivotColumnInfo is null) throw new Exception("Can't find column infor for pivot category"); + var tableInfo = pivotColumnInfo.ColumnInfo.TableInfo; + var dataDiffFetcher = new DiffDatabaseDataFetcher(10000000, tableInfo, (int)_dataLoadID, 50000);//todo update these numbers + dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); + //pivot categories that have been replaces 100%? + var replacedPivotCategories = previousCategories.Where(c => { - var previousEvaluation = dqeRepository.GetAllObjectsWhere("CatalogueID", _catalogue.ID).LastOrDefault() ?? 
throw new Exception("No DQE results currently exist"); - var evaluation = new Evaluation(dqeRepository, _catalogue); - var tableInfo = _catalogue.CatalogueItems.First().ColumnInfo.TableInfo; - var dataDiffFetcher = new DiffDatabaseDataFetcher(10000000, tableInfo, (int)_dataLoadID, 50000);//todo update these numbers - dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); - var replaced = dataDiffFetcher.Updates_Replaced; - var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); - var timeColumn = c.TimeCoverage_ExtractionInformation.ColumnInfo.GetRuntimeName(); - foreach (var rowState in previousEvaluation.RowStates) + if (incomingPivotCategories.Contains(c)) return false;//not a total replacement + var replacedCount = dataDiffFetcher.Updates_Replaced.AsEnumerable().Where(r => r[_pivotCategory].ToString() == c).Count(); + var previousRowState = previousRowSates.Where(rs => rs.PivotCategory == c).FirstOrDefault(); + if (previousRowState is null) return false; //did not exist before + var previousEvaluationTotal = previousRowState.Correct + previousRowState.Missing + previousRowState.Wrong + previousRowState.Invalid; + return replacedCount == previousEvaluationTotal; + }); + + // existing pivot categories coming in + var existingIncomingPivotCategories = incomingPivotCategories.Where(c => previousCategories.Contains(c) && !replacedPivotCategories.Contains(c) && c != "ALL"); + + //unchanges categories + foreach (var previousRowState in previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) + { + //copy row states that have not changes + evaluation.AddRowState(previousRowState.DataLoadRunID, previousRowState.Correct, previousRowState.Missing, previousRowState.Wrong, previousRowState.Invalid, previousRowState.ValidatorXML, previousRowState.PivotCategory, con.Connection, con.Transaction); + } + //new categories + foreach 
(var newCategory in newIncomingPivotCategories) + { + newByPivotRowStatesOverDataLoadRunId.TryGetValue(newCategory, out DQEStateOverDataLoadRunId incomingState); + incomingState.RowsPassingValidationByDataLoadRunID.TryGetValue((int)_dataLoadID, out int correct); + incomingState.WorstConsequencesByDataLoadRunID.TryGetValue((int)_dataLoadID, out Dictionary results); + results.TryGetValue(Consequence.Missing, out int mising); + results.TryGetValue(Consequence.Wrong, out int wrong); + results.TryGetValue(Consequence.InvalidatesRow, out int invalidatesRow); + evaluation.AddRowState((int)_dataLoadID, correct, mising, wrong, invalidatesRow, _catalogue.ValidatorXML, newCategory, con.Connection, con.Transaction); + } + if (existingIncomingPivotCategories.Any()) + { + //existing row states with new entries + var updatedRowsDataTable = new DataTable(); + var qb = new QueryBuilder(null, ""); + + using (var updateCon = _server.GetConnection()) { - var correct = rowState.Correct; - var missing = rowState.Missing; - var wrong = rowState.Wrong; - var invalid = rowState.Invalid; - var matchingReplacements = replaced.AsEnumerable().Where(row => int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == rowState.DataLoadRunID); - if (rowState.PivotCategory != "ALL") - { - matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == rowState.PivotCategory); - } - foreach (var replacement in matchingReplacements) - { - foreach (var itemValidator in _validator.ItemValidators) - { - var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - if (result is not null) - { - if (result.SourceConstraint.Consequence == Consequence.Missing) - { - missing -= 1; - } - else if (result.SourceConstraint.Consequence == Consequence.Wrong) - { - wrong -= 1; - } - else if 
(result.SourceConstraint.Consequence == Consequence.InvalidatesRow) - { - invalid -= 1; - } - } - else - { - correct -= 1; - } - } - } - if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; - evaluation.AddRowState(rowState.DataLoadRunID, correct, missing, wrong, invalid, rowState.ValidatorXML, rowState.PivotCategory, con.Connection, con.Transaction); + updateCon.Open(); + qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); + qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, updateCon); + cmd.CommandTimeout = 500000; + var adapter = _server.GetDataAdapter(cmd); + updatedRowsDataTable.BeginLoadData(); + adapter.Fill(updatedRowsDataTable); + updatedRowsDataTable.EndLoadData(); + updateCon.Close(); } + var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); + updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var updatedByPivotRowStatesOverDataLoadRunId = updatedRowsReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); - foreach (var columnState in previousEvaluation.ColumnStates) + foreach (var updatedCategory in existingIncomingPivotCategories) { - var dbNull = columnState.CountDBNull; - var missing = columnState.CountMissing; - var wrong = columnState.CountWrong; - var invalid = columnState.CountInvalidatesRow; - var correct = columnState.CountCorrect; - var matchingReplacements = replaced.AsEnumerable().Where(row => int.Parse(row[SpecialFieldNames.DataLoadRunID].ToString()) == columnState.DataLoadRunID); - if (columnState.PivotCategory != "ALL") - { - matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == columnState.PivotCategory); - } - foreach (var replacement in 
matchingReplacements) + updatedByPivotRowStatesOverDataLoadRunId.TryGetValue(updatedCategory, out DQEStateOverDataLoadRunId incomingState); + foreach (var loadId in incomingState.RowsPassingValidationByDataLoadRunID.Keys) { - var itemValidators = _validator.ItemValidators.Where(iv => iv.TargetProperty == columnState.TargetProperty); - if (itemValidators.Any()) - { - foreach (var itemValidator in itemValidators) - { - var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - var result = itemValidator.ValidateAll(replacement[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - if (result is null) - { - correct -= 1; - } - else if (result.SourceConstraint.Consequence == Consequence.Missing) - { - missing -= 1; - } - else if (result.SourceConstraint.Consequence == Consequence.Wrong) - { - wrong -= 1; - } - else if (result.SourceConstraint.Consequence == Consequence.InvalidatesRow) - { - invalid -= 1; - } - } - } - else - { - correct -= 1; //remove a correct entry - } + incomingState.RowsPassingValidationByDataLoadRunID.TryGetValue(loadId, out int _correct); + incomingState.WorstConsequencesByDataLoadRunID.TryGetValue((int)_dataLoadID, out Dictionary results); + results.TryGetValue(Consequence.Missing, out int _missing); + results.TryGetValue(Consequence.Wrong, out int _wrong); + results.TryGetValue(Consequence.InvalidatesRow, out int _invalidatesRow); + evaluation.AddRowState(loadId, _correct, _missing, _wrong, _invalidatesRow, _catalogue.ValidatorXML, updatedCategory, con.Connection, con.Transaction); } - if (correct < 1 && missing < 1 && wrong < 1 && invalid < 1) continue; - - var cs = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) - { - CountMissing = missing, - CountWrong = wrong, - CountInvalidatesRow = invalid, - CountCorrect = correct, - CountDBNull = dbNull - }; - cs.Commit(evaluation, columnState.PivotCategory, 
con.Connection, con.Transaction); } + } + //row state need to think about ALL & the various data load changes - ////periodicity is scuffed - - //var currentEvalCategories = newByPivotCategoryCubesOverTime.Keys.ToList(); - //var categoriesThatHaveGoneMissing = previousEvaluation.RowStates.Select(rs => rs.PivotCategory).Where(pc => !currentEvalCategories.Contains(pc)).ToList().Distinct(); - ////make sure they weren't in replaced... - //foreach (var category in categoriesThatHaveGoneMissing) - //{ - // var prevousCount = previousEvaluation.RowStates.Where(rs => rs.PivotCategory == category).Count(); - // var replacedCount = replaced.AsEnumerable().Where(r => r[pivotColumn].ToString() == category).Count(); - // if (prevousCount == replacedCount) - // { - // continue; - // } - // var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - // newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); - // foreach (var row in periodicityDT.AsEnumerable()) - // { - // var year = DateTime.Parse(row["YearMonth"].ToString()).Year; - // var month = DateTime.Parse(row["YearMonth"].ToString()).Month; - // var worseConsequence = row["RowEvaluation"].ToString(); - // _ = Enum.TryParse(worseConsequence, out Consequence cons); - - // var count = 0; - // while (count < int.Parse(row["CountOfRecords"].ToString())) - // { - // newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, worseConsequence == "Correct" ? null : cons); - // newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(year, month, worseConsequence == "Correct" ? 
null : cons); - // count++; - // } - // } - //} - ////add values for rows that were in previous eval, but not replaced - //var previousCategories = previousEvaluation.GetPivotCategoryValues(); - //var missingCategories = previousCategories.Where(pc => !currentEvalCategories.Contains(pc) && !categoriesThatHaveGoneMissing.Contains(pc)); - //foreach (var category in missingCategories) - //{ - // var periodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - // if (periodicityDT == null) continue; - // foreach (var row in periodicityDT.AsEnumerable()) - // { - // var year = DateTime.Parse(row["YearMonth"].ToString()).Year; - // var month = DateTime.Parse(row["YearMonth"].ToString()).Month; - // var worseConsequence = row["RowEvaluation"].ToString(); - // if (worseConsequence == "Correct") worseConsequence = null; - // var matchingReplacements = replaced.AsEnumerable(); - - // if (category != "ALL") - // { - // matchingReplacements = matchingReplacements.Where(row => row[pivotColumn].ToString() == category); - // } - // var matchesReplacement = false; - // foreach (var replacementRow in matchingReplacements) - // { - // var replacementYear = DateTime.Parse(replacementRow[timeColumn].ToString()).Year; - // var replacementMonth = DateTime.Parse(replacementRow[timeColumn].ToString()).Month; - // string replacementRowEvaluation = null;//todo - // var columnResults = new List(); - // var itemValidators = _validator.ItemValidators.Where(iv => replaced.Columns.Cast().Select(c => c.ColumnName).Contains(iv.TargetProperty)); - // if (itemValidators.Any()) - // { - // foreach (var itemValidator in itemValidators) - // { - // var columns = replaced.Columns.Cast().Where(c => c.ColumnName != itemValidator.TargetProperty).ToArray(); - // var result = itemValidator.ValidateAll(replacementRow[itemValidator.TargetProperty], columns, columns.Select(c => c.ColumnName).ToArray()); - // if (result != null) - // { - // 
columnResults.Add(result.SourceConstraint.Consequence); - // } - // } - // } - // if (columnResults.Any()) - // { - // replacementRowEvaluation = columnResults.OrderByDescending(x => (int)(x)).ToList().First().ToString(); - // } - // if (year != replacementYear) - // { - // continue; - // } - // if (month != replacementMonth) - // { - // continue; - // } - // if (worseConsequence != replacementRowEvaluation) - // { - // continue; - // } - // matchesReplacement = true; - // break; - // } - // if (!matchesReplacement) - // { - // _ = Enum.TryParse(worseConsequence, out Consequence cons); - // if (int.Parse(row["CountOfRecords"].ToString()) > 0) - // { - // if (!newByPivotCategoryCubesOverTime.TryGetValue(category, out var exists)) - // { - // newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); - // } - // newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, worseConsequence == "Correct" ? null : cons); - // newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(year, month, worseConsequence == "Correct" ? 
null : cons); - // } - // } - // } - //} - - //var persistingCategories = previousCategories.Where(pc => currentEvalCategories.Contains(pc)); - //foreach(var category in persistingCategories) - //{ - // var previousPeriodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, category, false); - // var currentPeriodicityDT = PeriodicityState.GetPeriodicityForDataTableForEvaluation(evaluation, category, false); - // Console.WriteLine("X"); - //} - - foreach (var state in newByPivotRowStatesOverDataLoadRunId.Values) - state.CommitToDatabase(evaluation, _catalogue, con.Connection, con.Transaction); - if (_timePeriodicityField != null) - foreach (var periodicity in newByPivotCategoryCubesOverTime.Values) - periodicity.CommitToDatabase(evaluation); - dqeRepository.EndTransactedConnection(true); - } - catch (Exception) - { - dqeRepository.EndTransactedConnection(false); - throw; - } + //var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, false); + + + + var cs = new ColumnState("chi", (int)_dataLoadID, _catalogue.ValidatorXML); + cs.Commit(evaluation, "c", con.Connection, con.Transaction); + + dqeRepository.EndTransactedConnection(true); + } forker.OnNotify(this, - new NotifyEventArgs(ProgressEventType.Information, - "CatalogueConstraintReport completed successfully and committed results to DQE server")); + new NotifyEventArgs(ProgressEventType.Information, + "CatalogueConstraintReport completed successfully and committed results to DQE server")); } catch (Exception e) { From ad1f0776684318a26cfb80ad1c9b96bcb26b4686 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 16 Dec 2024 14:37:26 +0000 Subject: [PATCH 20/35] working row states --- Rdmp.Core/DataQualityEngine/Data/RowState.cs | 11 ++++++ .../Reports/CatalogueConstraintReport.cs | 35 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/Rdmp.Core/DataQualityEngine/Data/RowState.cs 
b/Rdmp.Core/DataQualityEngine/Data/RowState.cs index 1ccf05bf0b..0b600039ff 100644 --- a/Rdmp.Core/DataQualityEngine/Data/RowState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/RowState.cs @@ -25,6 +25,17 @@ public class RowState public string PivotCategory { get; private set; } + public RowState(int dataLoadRunID, int correct, int missing, int wrong, int invalid, + string validatorXml, string pivotCategory) + { + Correct = correct; + Missing = missing; + Wrong = wrong; + Invalid = invalid; + ValidatorXML = validatorXml; + DataLoadRunID = dataLoadRunID; + } + public RowState(DbDataReader r) { Correct = Convert.ToInt32(r["Correct"]); diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 5ec811e5ce..0bb924e5dd 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -15,6 +15,7 @@ using System.Threading; using FAnsi.Discovery; using MongoDB.Driver; +using NPOI.OpenXmlFormats.Spreadsheet; using NPOI.OpenXmlFormats.Vml; using NPOI.SS.Formula.Functions; using NPOI.Util; @@ -235,6 +236,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var previousRowSates = previousEvaluation.RowStates; var previousCategories = previousEvaluation.GetPivotCategoryValues().Where(c => c != "ALL"); + //var AllStates = previousRowSates.Where(rs => rs.PivotCategory == "ALL"); + var evaluation = new Evaluation(dqeRepository, _catalogue); //new pivoutCategories coming in @@ -276,6 +279,17 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li results.TryGetValue(Consequence.Wrong, out int wrong); results.TryGetValue(Consequence.InvalidatesRow, out int invalidatesRow); evaluation.AddRowState((int)_dataLoadID, correct, mising, wrong, invalidatesRow, _catalogue.ValidatorXML, newCategory, con.Connection, con.Transaction); + //if(!AllStates.Any(state 
=> state.DataLoadRunID == (int)_dataLoadID)) + //{ + // AllStates.Append(new RowState((int)_dataLoadID, correct, mising, wrong, invalidatesRow, _catalogue.ValidatorXML, "ALL")); + //} + //else + //{ + // var current = AllStates.Where(state => state.DataLoadRunID == (int)_dataLoadID).First(); + // var newState = new RowState((int)_dataLoadID, correct+current.Correct, mising + current.Missing, wrong +current.Wrong, invalidatesRow+current.Invalid, _catalogue.ValidatorXML, "ALL"); + // AllStates = AllStates.Where(state => state.DataLoadRunID != (int)_dataLoadID); + // AllStates.Append(newState); + //} } if (existingIncomingPivotCategories.Any()) { @@ -314,6 +328,27 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } } } + List AllStates = new(); + foreach (var rowState in evaluation.RowStates) + { + if (!AllStates.Any(state => state.DataLoadRunID == rowState.DataLoadRunID)) + { + AllStates.Add(new RowState(rowState.DataLoadRunID, rowState.Correct, rowState.Missing, rowState.Wrong, rowState.Invalid, _catalogue.ValidatorXML, "ALL")); + } + else + { + var current = AllStates.Where(state => state.DataLoadRunID == (int)_dataLoadID).First(); + var newState = new RowState(rowState.DataLoadRunID, rowState.Correct + current.Correct, rowState.Missing + current.Missing, rowState.Wrong + current.Wrong, rowState.Invalid + current.Invalid, _catalogue.ValidatorXML, "ALL"); + AllStates = AllStates.Where(state => state.DataLoadRunID != rowState.DataLoadRunID).ToList(); + AllStates.Add(newState); + } + } + //todo need to remove from old ALL data loads and add for this data load + foreach (var state in AllStates) + { + evaluation.AddRowState(state.DataLoadRunID, state.Correct, state.Missing, state.Wrong, state.Invalid, _catalogue.ValidatorXML, "ALL", con.Connection, con.Transaction); + + } //row state need to think about ALL & the various data load changes From dbd7ed4182060a36f3acf647326581f9fea2992b Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 
17 Dec 2024 09:02:28 +0000 Subject: [PATCH 21/35] working column state --- .../DataQualityEngine/Data/ColumnState.cs | 3 +- .../Reports/CatalogueConstraintReport.cs | 116 +++++++++++++++--- 2 files changed, 98 insertions(+), 21 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs index 7099c1a74b..12cba851a8 100644 --- a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs @@ -146,8 +146,7 @@ public void Commit(Evaluation evaluation, string pivotCategory, DbConnection con DatabaseCommandHelper.AddParameterWithValueToCommand("@PivotCategory", cmd, pivotCategory); cmd.ExecuteNonQuery(); } - - + IsCommitted = true; } } \ No newline at end of file diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 0bb924e5dd..aa58226b29 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -263,6 +263,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li // existing pivot categories coming in var existingIncomingPivotCategories = incomingPivotCategories.Where(c => previousCategories.Contains(c) && !replacedPivotCategories.Contains(c) && c != "ALL"); + + //* Row States *// //unchanges categories foreach (var previousRowState in previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) { @@ -279,18 +281,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li results.TryGetValue(Consequence.Wrong, out int wrong); results.TryGetValue(Consequence.InvalidatesRow, out int invalidatesRow); evaluation.AddRowState((int)_dataLoadID, correct, mising, wrong, invalidatesRow, _catalogue.ValidatorXML, newCategory, 
con.Connection, con.Transaction); - //if(!AllStates.Any(state => state.DataLoadRunID == (int)_dataLoadID)) - //{ - // AllStates.Append(new RowState((int)_dataLoadID, correct, mising, wrong, invalidatesRow, _catalogue.ValidatorXML, "ALL")); - //} - //else - //{ - // var current = AllStates.Where(state => state.DataLoadRunID == (int)_dataLoadID).First(); - // var newState = new RowState((int)_dataLoadID, correct+current.Correct, mising + current.Missing, wrong +current.Wrong, invalidatesRow+current.Invalid, _catalogue.ValidatorXML, "ALL"); - // AllStates = AllStates.Where(state => state.DataLoadRunID != (int)_dataLoadID); - // AllStates.Append(newState); - //} } + //Updates if (existingIncomingPivotCategories.Any()) { //existing row states with new entries @@ -343,25 +335,111 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li AllStates.Add(newState); } } - //todo need to remove from old ALL data loads and add for this data load foreach (var state in AllStates) { evaluation.AddRowState(state.DataLoadRunID, state.Correct, state.Missing, state.Wrong, state.Invalid, _catalogue.ValidatorXML, "ALL", con.Connection, con.Transaction); } + //* Column States *// + List ColumnStates = []; + //unchanged + foreach (var previousColumnState in previousColumnStates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) + { + var cm = new ColumnState(previousColumnState.TargetProperty, previousColumnState.DataLoadRunID, previousColumnState.ItemValidatorXML) + { + CountCorrect = previousColumnState.CountCorrect, + CountMissing = previousColumnState.CountMissing, + CountWrong = previousColumnState.CountWrong, + CountInvalidatesRow = previousColumnState.CountInvalidatesRow, + CountDBNull = previousColumnState.CountDBNull + }; + cm.Commit(evaluation, previousColumnState.PivotCategory, con.Connection, con.Transaction); + ColumnStates.Add(cm); + } + //new 
stuff + foreach (var newCategory in newIncomingPivotCategories) + { + newByPivotRowStatesOverDataLoadRunId.TryGetValue(newCategory, out DQEStateOverDataLoadRunId incomingState); + incomingState.AllColumnStates.TryGetValue((int)_dataLoadID, out ColumnState[] columnStates); + foreach (var columnState in columnStates) + { + columnState.Commit(evaluation, newCategory, con.Connection, con.Transaction); + ColumnStates.Add(columnState); + } + } + //updates + if (existingIncomingPivotCategories.Any()) + { + var updatedRowsDataTable = new DataTable(); + var qb = new QueryBuilder(null, ""); - //row state need to think about ALL & the various data load changes - + using (var updateCon = _server.GetConnection()) + { + updateCon.Open(); + qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); + qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, updateCon); + cmd.CommandTimeout = 500000; + var adapter = _server.GetDataAdapter(cmd); + updatedRowsDataTable.BeginLoadData(); + adapter.Fill(updatedRowsDataTable); + updatedRowsDataTable.EndLoadData(); + updateCon.Close(); + } + var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); + updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var updatedByPivotRowStatesOverDataLoadRunId = updatedRowsReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); + foreach (var updatedCategory in existingIncomingPivotCategories) + { + updatedByPivotRowStatesOverDataLoadRunId.TryGetValue(updatedCategory, out DQEStateOverDataLoadRunId incomingState); + foreach (var loadId in incomingState.RowsPassingValidationByDataLoadRunID.Keys) + { + incomingState.AllColumnStates.TryGetValue(loadId, out ColumnState[] 
columnStates); + foreach (var columnState in columnStates) + { + columnState.Commit(evaluation, updatedCategory, con.Connection, con.Transaction); + ColumnStates.Add(columnState); + } + } + } + } + List AllColumns = new(); + foreach (var columnState in ColumnStates) + { + if (!AllColumns.Any(state => state.DataLoadRunID == columnState.DataLoadRunID && state.TargetProperty == columnState.TargetProperty && state.PivotCategory == columnState.PivotCategory)) + { + var cm = new ColumnState(columnState.TargetProperty, columnState.DataLoadRunID, columnState.ItemValidatorXML) + { + CountCorrect = columnState.CountCorrect, + CountMissing = columnState.CountMissing, + CountWrong = columnState.CountWrong, + CountInvalidatesRow = columnState.CountInvalidatesRow, + CountDBNull = columnState.CountDBNull + }; + AllColumns.Add(cm); + } + else + { + var index = AllColumns.FindIndex(state => state.DataLoadRunID == columnState.DataLoadRunID && state.TargetProperty == columnState.TargetProperty && state.PivotCategory == columnState.PivotCategory); + if (index != -1) + { + AllColumns[index].CountCorrect += columnState.CountCorrect; + AllColumns[index].CountMissing += columnState.CountMissing; + AllColumns[index].CountWrong += columnState.CountWrong; + AllColumns[index].CountInvalidatesRow += columnState.CountInvalidatesRow; + AllColumns[index].CountDBNull += columnState.CountDBNull; + } + } + } + foreach (var column in AllColumns) + { + column.Commit(evaluation, "ALL", con.Connection, con.Transaction); + } //var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, false); - - - var cs = new ColumnState("chi", (int)_dataLoadID, _catalogue.ValidatorXML); - cs.Commit(evaluation, "c", con.Connection, con.Transaction); - dqeRepository.EndTransactedConnection(true); } From e8f4f0618a3fe4953b51715e0e03ae50875cddcf Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 17 Dec 2024 10:07:21 +0000 Subject: [PATCH 22/35] confirm rows and columns --- 
.../Reports/CatalogueConstraintReport.cs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index aa58226b29..25205b358b 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -227,7 +227,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); var timeColumn = c.TimeCoverage_ExtractionInformation.ColumnInfo.GetRuntimeName(); - var incomingPivotCategories = rDT.AsEnumerable().Select(r => r[pivotColumn].ToString()).ToList(); + var incomingPivotCategories = rDT.AsEnumerable().Select(r => r[pivotColumn].ToString()).ToList().Distinct(); using (var con = dqeRepository.BeginNewTransactedConnection()) { @@ -312,7 +312,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li foreach (var loadId in incomingState.RowsPassingValidationByDataLoadRunID.Keys) { incomingState.RowsPassingValidationByDataLoadRunID.TryGetValue(loadId, out int _correct); - incomingState.WorstConsequencesByDataLoadRunID.TryGetValue((int)_dataLoadID, out Dictionary results); + incomingState.WorstConsequencesByDataLoadRunID.TryGetValue(loadId, out Dictionary results); results.TryGetValue(Consequence.Missing, out int _missing); results.TryGetValue(Consequence.Wrong, out int _wrong); results.TryGetValue(Consequence.InvalidatesRow, out int _invalidatesRow); @@ -329,10 +329,13 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } else { - var current = AllStates.Where(state => state.DataLoadRunID == (int)_dataLoadID).First(); - var newState = new RowState(rowState.DataLoadRunID, rowState.Correct + current.Correct, rowState.Missing + current.Missing, rowState.Wrong + current.Wrong, 
rowState.Invalid + current.Invalid, _catalogue.ValidatorXML, "ALL"); - AllStates = AllStates.Where(state => state.DataLoadRunID != rowState.DataLoadRunID).ToList(); - AllStates.Add(newState); + var current = AllStates.Where(state => state.DataLoadRunID == (int)_dataLoadID).FirstOrDefault(); + if (current is not null) + { + var newState = new RowState(rowState.DataLoadRunID, rowState.Correct + current.Correct, rowState.Missing + current.Missing, rowState.Wrong + current.Wrong, rowState.Invalid + current.Invalid, _catalogue.ValidatorXML, "ALL"); + AllStates = AllStates.Where(state => state.DataLoadRunID != rowState.DataLoadRunID).ToList(); + AllStates.Add(newState); + } } } foreach (var state in AllStates) From 252f16d0417d97d4de1ffca98d0f078603d642a8 Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 17 Dec 2024 10:24:22 +0000 Subject: [PATCH 23/35] actually fix rows --- .../DataQualityEngine/Reports/CatalogueConstraintReport.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 25205b358b..bd4d20ddc5 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -329,7 +329,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } else { - var current = AllStates.Where(state => state.DataLoadRunID == (int)_dataLoadID).FirstOrDefault(); + var current = AllStates.Where(state => state.DataLoadRunID == rowState.DataLoadRunID).FirstOrDefault(); if (current is not null) { var newState = new RowState(rowState.DataLoadRunID, rowState.Correct + current.Correct, rowState.Missing + current.Missing, rowState.Wrong + current.Wrong, rowState.Invalid + current.Invalid, _catalogue.ValidatorXML, "ALL"); From b98a7f842d29163587ace506725087eb9cd88383 Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 17 Dec 
2024 11:05:17 +0000 Subject: [PATCH 24/35] add start of periodicity --- .../Reports/CatalogueConstraintReport.cs | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index bd4d20ddc5..9e45afe8ce 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -236,8 +236,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var previousRowSates = previousEvaluation.RowStates; var previousCategories = previousEvaluation.GetPivotCategoryValues().Where(c => c != "ALL"); - //var AllStates = previousRowSates.Where(rs => rs.PivotCategory == "ALL"); - var evaluation = new Evaluation(dqeRepository, _catalogue); //new pivoutCategories coming in @@ -440,6 +438,38 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li column.Commit(evaluation, "ALL", con.Connection, con.Transaction); } + //* Periodicity States *// + + //Unchanged + var x = newByPivotCategoryCubesOverTime; + + var unchangedPivotCategories = previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory)).Select(rs => rs.PivotCategory).Distinct(); foreach (var previousRowState in previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) ; + foreach (var pivotCategory in unchangedPivotCategories) + { + var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, pivotCategory, false); + foreach (var row in previousPeriodicity.AsEnumerable()) + { + var countOfRecords = int.Parse(row[2].ToString()); + for (var i = 0; i < countOfRecords; i++) { 
+ newByPivotCategoryCubesOverTime.TryGetValue(pivotCategory, out var value); + if(value is null) + { + newByPivotCategoryCubesOverTime[pivotCategory] = new PeriodicityCubesOverTime(pivotCategory); + } + Consequence.TryParse(row[3].ToString(), out Consequence consequence); + var date = DateTime.Parse(row[1].ToString()); + + newByPivotCategoryCubesOverTime[pivotCategory].IncrementHyperCube(date.Year, date.Month, consequence); + newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(date.Year, date.Month, consequence); + } + } + } + //what about the replacements? + //ADD all the new stuff + foreach (var v in newByPivotCategoryCubesOverTime.Values) + { + v.CommitToDatabase(evaluation); + } //var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, false); From 3278c4be70eae1d5bd41df1ee1ff5b320dd30be1 Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 17 Dec 2024 14:58:43 +0000 Subject: [PATCH 25/35] working periodicity --- .../Reports/CatalogueConstraintReport.cs | 116 +++++++++++++++++- .../PeriodicityCubesOverTime.cs | 5 + 2 files changed, 117 insertions(+), 4 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 9e45afe8ce..4762942c87 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -441,7 +441,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li //* Periodicity States *// //Unchanged - var x = newByPivotCategoryCubesOverTime; + newByPivotCategoryCubesOverTime = new();//reset var unchangedPivotCategories = previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory)).Select(rs => rs.PivotCategory).Distinct(); foreach (var previousRowState in 
previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) ; foreach (var pivotCategory in unchangedPivotCategories) @@ -450,21 +450,129 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li foreach (var row in previousPeriodicity.AsEnumerable()) { var countOfRecords = int.Parse(row[2].ToString()); - for (var i = 0; i < countOfRecords; i++) { + for (var i = 0; i < countOfRecords; i++) + { newByPivotCategoryCubesOverTime.TryGetValue(pivotCategory, out var value); - if(value is null) + if (value is null) { newByPivotCategoryCubesOverTime[pivotCategory] = new PeriodicityCubesOverTime(pivotCategory); } Consequence.TryParse(row[3].ToString(), out Consequence consequence); var date = DateTime.Parse(row[1].ToString()); - newByPivotCategoryCubesOverTime[pivotCategory].IncrementHyperCube(date.Year, date.Month, consequence); + newByPivotCategoryCubesOverTime.TryGetValue("ALL", out value); + if (value is null) + { + newByPivotCategoryCubesOverTime["ALL"] = new PeriodicityCubesOverTime("ALL"); + } newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(date.Year, date.Month, consequence); } } } //what about the replacements? 
+ if (existingIncomingPivotCategories.Any()) + { + var updatedRowsDataTable = new DataTable(); + var qb = new QueryBuilder(null, ""); + + using (var updateCon = _server.GetConnection()) + { + updateCon.Open(); + qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); + qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, updateCon); + cmd.CommandTimeout = 500000; + var adapter = _server.GetDataAdapter(cmd); + updatedRowsDataTable.BeginLoadData(); + adapter.Fill(updatedRowsDataTable); + updatedRowsDataTable.EndLoadData(); + updateCon.Close(); + } + var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); + updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var cc = updatedRowsReportBuilder.GetByPivotCategoryCubesOverTime(); + foreach (var category in cc.Keys) + { + var hyperCube = cc[category].GetHyperCube(); + foreach (var year in hyperCube.Keys) + { + var periodicityCubes = hyperCube[year]; + foreach (var month in periodicityCubes.Keys) + { + var cube = periodicityCubes[month]; + foreach (var consequence in Enum.GetValues(typeof(Consequence)).Cast().ToList()) + { + var state = cube.GetStateForConsequence(consequence); + for (var i = 0; i < state.CountOfRecords; i++) + { + newByPivotCategoryCubesOverTime.TryGetValue(category, out var value); + if (value is null) + { + newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + } + newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, consequence); + } + + } + } + + } + //want to add this to newByPivotCategoryCubesOverTime + + } + } + //foreach (var newCategory in newIncomingPivotCategories) + if (newIncomingPivotCategories.Any()) 
+ { + var updatedRowsDataTable = new DataTable(); + var qb = new QueryBuilder(null, ""); + + using (var updateCon = _server.GetConnection()) + { + updateCon.Open(); + qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); + qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', newIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); + var cmd = _server.GetCommand(qb.SQL, updateCon); + cmd.CommandTimeout = 500000; + var adapter = _server.GetDataAdapter(cmd); + updatedRowsDataTable.BeginLoadData(); + adapter.Fill(updatedRowsDataTable); + updatedRowsDataTable.EndLoadData(); + updateCon.Close(); + } + var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); + updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); + var cc = updatedRowsReportBuilder.GetByPivotCategoryCubesOverTime(); + foreach (var category in cc.Keys) + { + var hyperCube = cc[category].GetHyperCube(); + foreach (var year in hyperCube.Keys) + { + var periodicityCubes = hyperCube[year]; + foreach (var month in periodicityCubes.Keys) + { + var cube = periodicityCubes[month]; + foreach (var consequence in Enum.GetValues(typeof(Consequence)).Cast().ToList()) + { + var state = cube.GetStateForConsequence(consequence); + for (var i = 0; i < state.CountOfRecords; i++) + { + newByPivotCategoryCubesOverTime.TryGetValue(category, out var value); + if (value is null) + { + newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); + } + newByPivotCategoryCubesOverTime[category].IncrementHyperCube(year, month, consequence); + } + + } + } + + } + //want to add this to newByPivotCategoryCubesOverTime + + } + } //ADD all the new stuff foreach (var v in newByPivotCategoryCubesOverTime.Values) { diff --git 
a/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs b/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs index c35bbbdbf5..e22476cc58 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/PeriodicityHelpers/PeriodicityCubesOverTime.cs @@ -31,6 +31,11 @@ public PeriodicityCubesOverTime(string pivotCategory) _pivotCategory = pivotCategory; } + public Dictionary> GetHyperCube() + { + return hyperCube; + } + public string GetPivotCategory() { return _pivotCategory; From cd0df1b4cfc40395abecbc682c423cef31758187 Mon Sep 17 00:00:00 2001 From: James Friel Date: Tue, 17 Dec 2024 16:29:13 +0000 Subject: [PATCH 26/35] interim --- .../LoadExecution/Components/Runtime/AttacherRuntimeTask.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs b/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs index b77b5d6efb..71e47f4b04 100644 --- a/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs +++ b/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs @@ -11,6 +11,7 @@ using Rdmp.Core.DataLoad.Engine.Attachers; using Rdmp.Core.DataLoad.Engine.Job; using Rdmp.Core.DataLoad.Engine.LoadExecution.Components.Arguments; +using Rdmp.Core.DataLoad.Modules.Attachers; using Rdmp.Core.Repositories; using Rdmp.Core.ReusableLibraryCode.Checks; using Rdmp.Core.ReusableLibraryCode.Progress; @@ -29,6 +30,8 @@ public class AttacherRuntimeTask : RuntimeTask, IMEFRuntimeTask public AttacherRuntimeTask(IProcessTask task, RuntimeArgumentCollection args) : base(task, args) { + + //RequestsExternalDatabaseCreation //All attachers must be marked as mounting stages, and therefore we can pull out the RAW Server and Name var mountingStageArgs = args.StageSpecificArguments; if 
(mountingStageArgs.LoadStage != LoadStage.Mounting) @@ -38,7 +41,7 @@ public AttacherRuntimeTask(IProcessTask task, RuntimeArgumentCollection args) throw new ArgumentException( $"Path is blank for ProcessTask '{task}' - it should be a class name of type {nameof(IAttacher)}"); - Attacher = MEF.CreateA(ProcessTask.Path); + Attacher = MEF.CreateA(ProcessTask.Path, new object[] { ProcessTask.Path == typeof(RemoteTableWithoutDBCreationAttacher).ToString()?false:true }); SetPropertiesForClass(RuntimeArguments, Attacher); Attacher.Initialize(args.StageSpecificArguments.RootDir, RuntimeArguments.StageSpecificArguments.DbInfo); } From 0bb606766d01e334c1a2c48bd3db537297aa7d27 Mon Sep 17 00:00:00 2001 From: James Friel Date: Thu, 19 Dec 2024 13:34:34 +0000 Subject: [PATCH 27/35] add test --- .../DQEPartialUpdateTests.cs | 431 ++++++++++++++++++ .../Components/Runtime/AttacherRuntimeTask.cs | 2 +- .../Data/PeriodicityState.cs | 1 + 3 files changed, 433 insertions(+), 1 deletion(-) create mode 100644 Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs diff --git a/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs new file mode 100644 index 0000000000..6b2952f7af --- /dev/null +++ b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs @@ -0,0 +1,431 @@ +using NLog; +using NPOI.POIFS.Properties; +using NPOI.SS.Formula.Functions; +using NUnit.Framework; +using Org.BouncyCastle.Tls; +using Rdmp.Core.Curation; +using Rdmp.Core.Curation.Data; +using Rdmp.Core.Curation.Data.DataLoad; +using Rdmp.Core.Curation.Data.Defaults; +using Rdmp.Core.DataFlowPipeline; +using Rdmp.Core.DataLoad; +using Rdmp.Core.DataLoad.Engine.Checks.Checkers; +using Rdmp.Core.DataLoad.Engine.DatabaseManagement.EntityNaming; +using Rdmp.Core.DataLoad.Engine.Job; +using Rdmp.Core.DataLoad.Engine.LoadExecution; +using Rdmp.Core.DataLoad.Engine.LoadProcess; +using Rdmp.Core.DataLoad.Modules.Attachers; +using 
Rdmp.Core.DataLoad.Modules.DataProvider; +using Rdmp.Core.DataLoad.Modules.Mutilators; +using Rdmp.Core.DataLoad.Triggers; +using Rdmp.Core.DataQualityEngine.Data; +using Rdmp.Core.DataQualityEngine.Reports; +using Rdmp.Core.Logging; +using Rdmp.Core.Repositories; +using Rdmp.Core.ReusableLibraryCode.Checks; +using Rdmp.Core.ReusableLibraryCode.Progress; +using Rdmp.Core.Tests.DataLoad.Engine.Integration; +using System; +using System.Collections.Generic; +using System.Data; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; +using Tests.Common; +using static Org.BouncyCastle.Math.EC.ECCurve; + +namespace Rdmp.Core.Tests.DataQualityEngine +{ + internal class DQEPartialUpdateTests : DataLoadEngineTestsBase + { + + string validatorXML = "\r\n\r\n \r\n \r\n \r\n Wrong\r\n \r\n chi\r\n \r\n \r\n \r\n time\r\n \r\n \r\n \r\n"; + string fileLocation = Path.GetTempPath(); + string fileName = "SteppedDQEPartialUpdates.csv"; + + [Test] + public void SteppedDQEPartialUpdates() + { + var server = GetCleanedServer(FAnsi.DatabaseType.MicrosoftSQLServer); + + var dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Columns.Add("hic_dataLoadRunID"); + dt.Rows.Add(new object[] { "1111111111", "A", "2024-12-01",10 }); + dt.Rows.Add(new object[] { "1111111112", "A", "2024-11-01",10 }); + + var table = server.CreateTable("PartialToaDQE", dt); + table.CreatePrimaryKey(table.DiscoverColumns().Where(c => c.GetRuntimeName() == "chi").ToArray()); + + var catalogue = new Catalogue(CatalogueRepository, "PartialToaDQE"); + var importer = new TableInfoImporter(CatalogueRepository, table); + importer.DoImport(out var _tableInfo, out var _columnInfos); + foreach (var columnInfo in _columnInfos) + { + var ci = new CatalogueItem(CatalogueRepository, catalogue, 
columnInfo.GetRuntimeName()); + ci.SaveToDatabase(); + var ei = new ExtractionInformation(CatalogueRepository, ci, columnInfo, ""); + ei.SaveToDatabase(); + } + var dqeRepository = new DQERepository(CatalogueRepository); + + catalogue.ValidatorXML = validatorXML; + catalogue.TimeCoverage_ExtractionInformation_ID = catalogue.GetAllExtractionInformation(ExtractionCategory.Any) + .Single(e => e.GetRuntimeName().Equals("time")).ID; + + catalogue.PivotCategory_ExtractionInformation_ID = catalogue.GetAllExtractionInformation(ExtractionCategory.Any) + .Single(e => e.GetRuntimeName().Equals("value")).ID; + + var report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + var source = new CancellationTokenSource(); + + var listener = new ToMemoryDataLoadEventListener(false); + report.GenerateReport(catalogue, listener, source.Token); + + var lmd = new LoadMetadata(CatalogueRepository, "MyLoad"); + lmd.LocationOfForLoadingDirectory = Path.GetTempPath(); + lmd.LocationOfForArchivingDirectory = Path.GetTempPath(); + lmd.LocationOfExecutablesDirectory = Path.GetTempPath(); + lmd.LocationOfCacheDirectory = Path.GetTempPath(); + lmd.SaveToDatabase(); + var loggingServer = CatalogueRepository.GetDefaultFor(PermissableDefaults.LiveLoggingServer_ID); + var logManager = new Core.Logging.LogManager(loggingServer); + logManager.CreateNewLoggingTaskIfNotExists(lmd.Name); + catalogue.LoggingDataTask = lmd.Name; + catalogue.SaveToDatabase(); + lmd.LinkToCatalogue(catalogue); + + //fetch files + var fetchDataProcessTask = new ProcessTask(CatalogueRepository, lmd, LoadStage.GetFiles); + fetchDataProcessTask.ProcessTaskType = ProcessTaskType.DataProvider; + fetchDataProcessTask.Path = "Rdmp.Core.DataLoad.Modules.DataProvider.ImportFilesDataProvider"; + fetchDataProcessTask.SaveToDatabase(); + + fetchDataProcessTask.CreateArgumentsForClassIfNotExists(); + 
fetchDataProcessTask.SetArgumentValue("DirectoryPath", fileLocation); + fetchDataProcessTask.SetArgumentValue("FilePattern", fileName); + fetchDataProcessTask.SaveToDatabase(); + + //load file + var attachProcessTask = new ProcessTask(CatalogueRepository, lmd, LoadStage.Mounting); + attachProcessTask.ProcessTaskType = ProcessTaskType.Attacher; + attachProcessTask.Path = "Rdmp.Core.DataLoad.Modules.Attachers.AnySeparatorFileAttacher"; + attachProcessTask.SaveToDatabase(); + attachProcessTask.CreateArgumentsForClassIfNotExists(); + attachProcessTask.SetArgumentValue("Separator", ","); + attachProcessTask.SetArgumentValue("FilePattern", fileName); + attachProcessTask.SetArgumentValue("TableToLoad", _tableInfo); + attachProcessTask.SaveToDatabase(); + + var dqeUpdate = new ProcessTask(CatalogueRepository, lmd, LoadStage.PostLoad); + dqeUpdate.ProcessTaskType = ProcessTaskType.MutilateDataTable; + dqeUpdate.Path = "Rdmp.Core.DataLoad.Modules.Mutilators.DQEPostLoadRunner"; + dqeUpdate.CreateArgumentsForClassIfNotExists(); + dqeUpdate.SaveToDatabase(); + + //first load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "A", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "B", "2024-10-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of first load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + var evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(3)); + CompareEvaluations(evaluations[1], evaluations[2]); + + //second load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + 
dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "A", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "C", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "D", "2024-10-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of second load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(5)); + CompareEvaluations(evaluations[3], evaluations[4]); + + //third load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "C", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "C", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "A", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "B", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "E", "2024-08-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of third load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(7)); + CompareEvaluations(evaluations[6], evaluations[5]); + + //fourth load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "A", 
"2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "A", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "D", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "B", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "C", "2024-08-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of fourth load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList();//.Where(e => e.CatalogueID == catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(9)); + CompareEvaluations(evaluations[8], evaluations[7]); + + //fifth load + dt = new DataTable(); + dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "C", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "B", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "D", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "A", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "A", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "A", "2024-08-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of fifth load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(11)); + CompareEvaluations(evaluations[10], evaluations[9]); + + //sixth load + dt = new DataTable(); + 
dt.Columns.Add("chi"); + dt.Columns.Add("value"); + dt.Columns.Add("time"); + dt.Rows.Add(new string[] { "1111111111", "C", "2024-12-01" }); + dt.Rows.Add(new string[] { "1111111112", "B", "2024-11-01" }); + dt.Rows.Add(new string[] { "1111111113", "B", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111114", "C", "2024-10-01" }); + dt.Rows.Add(new string[] { "1111111115", "D", "2024-09-01" }); + dt.Rows.Add(new string[] { "1111111116", "A", "2024-08-01" }); + dt.Rows.Add(new string[] { "1111111117", "A", "2024-07-01" }); + dt.Rows.Add(new string[] { "1111111118", "B", "2024-06-01" }); + dt.Rows.Add(new string[] { "1111111119", "C", "2024-05-01" }); + dt.Rows.Add(new string[] { "1111111120", "D", "2024-04-01" }); + dt.Rows.Add(new string[] { "1111111121", "E", "2024-03-01" }); + SetupFile(dt); + + PerformLoad(lmd, logManager); + //end of sixth load + report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) + { + ExplicitDQERepository = dqeRepository + }; + + report.Check(ThrowImmediatelyCheckNotifier.Quiet); + report.GenerateReport(catalogue, listener, source.Token); + + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList();//.Where(e => e.CatalogueID == catalogue.ID).ToList(); + Assert.That(evaluations.Count, Is.EqualTo(13)); + CompareEvaluations(evaluations[12], evaluations[11]); + + + Assert.That(true, Is.EqualTo(true)); + + } + + private void SetupFile(DataTable dt) + { + if (File.Exists(Path.Combine(fileLocation, fileName))) + { + File.Delete(Path.Combine(fileLocation, fileName)); + } + var fs = File.Create(Path.Combine(fileLocation, fileName)); + fs.Close(); + var lines = new List() { + string.Join(',',dt.Columns.Cast().Select(c => c.ColumnName)) + }; + foreach (var row in dt.AsEnumerable()) + { + lines.Add(string.Join(',', row.ItemArray.Select(i => i.ToString()))); + } + File.AppendAllLines(Path.Combine(fileLocation, fileName), lines); + } + + private void CompareEvaluations(Evaluation e1, 
Evaluation e2) + { + Assert.That(e1.ColumnStates.Length, Is.EqualTo(e2.ColumnStates.Length)); + Assert.That(e1.RowStates.Length, Is.EqualTo(e2.RowStates.Length)); + List columnStateDiff = e1.ColumnStates.Except(e2.ColumnStates, new ColumnStateCompare()).ToList(); + Assert.That(columnStateDiff.Count, Is.EqualTo(0)); + columnStateDiff = e2.ColumnStates.Except(e1.ColumnStates, new ColumnStateCompare()).ToList(); + Assert.That(columnStateDiff.Count, Is.EqualTo(0)); + List rowStateDiff = e1.RowStates.Except(e2.RowStates, new RowStateCompare()).ToList(); + Assert.That(rowStateDiff.Count, Is.EqualTo(0)); + rowStateDiff = e2.RowStates.Except(e1.RowStates, new RowStateCompare()).ToList(); + Assert.That(rowStateDiff.Count, Is.EqualTo(0)); + + Assert.That(e1.GetPivotCategoryValues(), Is.EqualTo(e2.GetPivotCategoryValues())); + foreach (var category in e1.GetPivotCategoryValues()) + { + var e1Periodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(e1, category, false); + e1Periodicity.Columns.Remove("Evaluation_ID"); + var e2Periodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(e2, category, false); + e2Periodicity.Columns.Remove("Evaluation_ID"); + var differences = + e1Periodicity.AsEnumerable().Except(e2Periodicity.AsEnumerable(), + DataRowComparer.Default); + Assert.That(differences.Any(), Is.False); + + } + } + + private class ColumnStateCompare : IEqualityComparer + { + public ColumnStateCompare() + { + } + public bool Equals(ColumnState x, ColumnState y) + { + return x.TargetProperty == y.TargetProperty && + x.PivotCategory == y.PivotCategory && + x.CountCorrect == y.CountCorrect && + x.CountMissing == y.CountMissing && + x.CountWrong == y.CountWrong && + x.CountInvalidatesRow == y.CountInvalidatesRow && + x.CountDBNull == y.CountDBNull; + } + public int GetHashCode(T obj) + { + return obj.GetHashCode(); + } + + public int GetHashCode([DisallowNull] ColumnState obj) + { + return 1; + } + } + + private class RowStateCompare : 
IEqualityComparer + { + public RowStateCompare() + { + } + public bool Equals(RowState x, RowState y) + { + return x.Correct == y.Correct && + x.Missing == y.Missing && + x.Wrong == y.Wrong && + x.Invalid == y.Invalid && + x.PivotCategory == y.PivotCategory; + + } + public int GetHashCode(T obj) + { + return obj.GetHashCode(); + } + + public int GetHashCode([DisallowNull] RowState obj) + { + return 1; + } + } + + + private void PerformLoad(LoadMetadata lmd, Core.Logging.LogManager logManager) + { + var dbConfig = new HICDatabaseConfiguration(lmd, null); + var projectDirectory = SetupLoadDirectory(lmd); + var job = new DataLoadJob(RepositoryLocator, "Go go go!", logManager, lmd, projectDirectory, + ThrowImmediatelyDataLoadEventListener.Quiet, dbConfig); + + new PreExecutionChecker(lmd, dbConfig).Check( + new AcceptAllCheckNotifier()); + + var loadFactory = new HICDataLoadFactory( + lmd, + dbConfig, + new HICLoadConfigurationFlags(), + CatalogueRepository, + logManager + ); + + var exe = loadFactory.Create(ThrowImmediatelyDataLoadEventListener.Quiet); + + var exitCode = exe.Run( + job, + new GracefulCancellationToken()); + + Assert.That(exitCode, Is.EqualTo(ExitCodeType.Success)); + } + + } +} diff --git a/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs b/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs index 71e47f4b04..060cdbac52 100644 --- a/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs +++ b/Rdmp.Core/DataLoad/Engine/LoadExecution/Components/Runtime/AttacherRuntimeTask.cs @@ -41,7 +41,7 @@ public AttacherRuntimeTask(IProcessTask task, RuntimeArgumentCollection args) throw new ArgumentException( $"Path is blank for ProcessTask '{task}' - it should be a class name of type {nameof(IAttacher)}"); - Attacher = MEF.CreateA(ProcessTask.Path, new object[] { ProcessTask.Path == typeof(RemoteTableWithoutDBCreationAttacher).ToString()?false:true }); + Attacher = 
MEF.CreateA(ProcessTask.Path); SetPropertiesForClass(RuntimeArguments, Attacher); Attacher.Initialize(args.StageSpecificArguments.RootDir, RuntimeArguments.StageSpecificArguments.DbInfo); } diff --git a/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs b/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs index 7636c26deb..a82e7498c5 100644 --- a/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs @@ -7,6 +7,7 @@ using System; using System.Collections.Generic; using System.Data; +using Rdmp.Core.MapsDirectlyToDatabaseTable; using Rdmp.Core.ReusableLibraryCode; using Rdmp.Core.Validation.Constraints; From 05f172172e276bad25414020cef6b954eba01c54 Mon Sep 17 00:00:00 2001 From: James Friel Date: Thu, 19 Dec 2024 13:54:34 +0000 Subject: [PATCH 28/35] add class documentation --- .../Reports/CatalogueConstraintReport.cs | 31 ++++++++++++------- .../Reports/ReportBuilder.cs | 9 +++++- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 4762942c87..5645130ce8 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -240,6 +240,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li //new pivoutCategories coming in var newIncomingPivotCategories = incomingPivotCategories.Where(c => !previousCategories.Contains(c)); + List ColumnStates = []; var pivotColumnInfo = _catalogue.CatalogueItems.Where(ci => ci.Name == _pivotCategory).FirstOrDefault(); @@ -279,6 +280,13 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li results.TryGetValue(Consequence.Wrong, out int wrong); results.TryGetValue(Consequence.InvalidatesRow, out int invalidatesRow); evaluation.AddRowState((int)_dataLoadID, correct, mising, wrong, 
invalidatesRow, _catalogue.ValidatorXML, newCategory, con.Connection, con.Transaction); + + incomingState.AllColumnStates.TryGetValue((int)_dataLoadID, out ColumnState[] columnStates); + foreach (var columnState in columnStates) + { + columnState.Commit(evaluation, newCategory, con.Connection, con.Transaction); + ColumnStates.Add(columnState); + } } //Updates if (existingIncomingPivotCategories.Any()) @@ -342,7 +350,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } //* Column States *// - List ColumnStates = []; //unchanged foreach (var previousColumnState in previousColumnStates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) { @@ -358,16 +365,16 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li ColumnStates.Add(cm); } //new stuff - foreach (var newCategory in newIncomingPivotCategories) - { - newByPivotRowStatesOverDataLoadRunId.TryGetValue(newCategory, out DQEStateOverDataLoadRunId incomingState); - incomingState.AllColumnStates.TryGetValue((int)_dataLoadID, out ColumnState[] columnStates); - foreach (var columnState in columnStates) - { - columnState.Commit(evaluation, newCategory, con.Connection, con.Transaction); - ColumnStates.Add(columnState); - } - } + //foreach (var newCategory in newIncomingPivotCategories) + //{ + // newByPivotRowStatesOverDataLoadRunId.TryGetValue(newCategory, out DQEStateOverDataLoadRunId incomingState); + // incomingState.AllColumnStates.TryGetValue((int)_dataLoadID, out ColumnState[] columnStates); + // foreach (var columnState in columnStates) + // { + // columnState.Commit(evaluation, newCategory, con.Connection, con.Transaction); + // ColumnStates.Add(columnState); + // } + //} //updates if (existingIncomingPivotCategories.Any()) { @@ -492,7 +499,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li 
updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); var cc = updatedRowsReportBuilder.GetByPivotCategoryCubesOverTime(); foreach (var category in cc.Keys) - { + { var hyperCube = cc[category].GetHyperCube(); foreach (var year in hyperCube.Keys) { diff --git a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs index b60e097b2b..954e4a8de2 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs @@ -1,4 +1,8 @@ -using FAnsi.Discovery; +// Copyright (c) The University of Dundee 2024-2024 +// This file is part of the Research Data Management Platform (RDMP). +// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// You should have received a copy of the GNU General Public License along with RDMP. If not, see . 
using Rdmp.Core.Curation.Data; using Rdmp.Core.DataLoad.Triggers; using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; @@ -20,6 +24,9 @@ namespace Rdmp.Core.DataQualityEngine.Reports; +/// +/// Class used to build cataloge constraint reports +/// public class ReportBuilder { private readonly string _dataLoadRunFieldName; From e0bb3d3e6ba85e4b0b14997ac26c37ec191bf5e3 Mon Sep 17 00:00:00 2001 From: James Friel Date: Thu, 19 Dec 2024 15:03:10 +0000 Subject: [PATCH 29/35] tidy up --- .../Reports/CatalogueConstraintReport.cs | 82 +++++-------------- 1 file changed, 22 insertions(+), 60 deletions(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 5645130ce8..263a134c8f 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -323,6 +323,13 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li results.TryGetValue(Consequence.Wrong, out int _wrong); results.TryGetValue(Consequence.InvalidatesRow, out int _invalidatesRow); evaluation.AddRowState(loadId, _correct, _missing, _wrong, _invalidatesRow, _catalogue.ValidatorXML, updatedCategory, con.Connection, con.Transaction); + + incomingState.AllColumnStates.TryGetValue(loadId, out ColumnState[] columnStates); + foreach (var columnState in columnStates) + { + columnState.Commit(evaluation, updatedCategory, con.Connection, con.Transaction); + ColumnStates.Add(columnState); + } } } } @@ -364,54 +371,6 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li cm.Commit(evaluation, previousColumnState.PivotCategory, con.Connection, con.Transaction); ColumnStates.Add(cm); } - //new stuff - //foreach (var newCategory in newIncomingPivotCategories) - //{ - // newByPivotRowStatesOverDataLoadRunId.TryGetValue(newCategory, out DQEStateOverDataLoadRunId incomingState); - // 
incomingState.AllColumnStates.TryGetValue((int)_dataLoadID, out ColumnState[] columnStates); - // foreach (var columnState in columnStates) - // { - // columnState.Commit(evaluation, newCategory, con.Connection, con.Transaction); - // ColumnStates.Add(columnState); - // } - //} - //updates - if (existingIncomingPivotCategories.Any()) - { - var updatedRowsDataTable = new DataTable(); - var qb = new QueryBuilder(null, ""); - - using (var updateCon = _server.GetConnection()) - { - updateCon.Open(); - qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); - qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); - var cmd = _server.GetCommand(qb.SQL, updateCon); - cmd.CommandTimeout = 500000; - var adapter = _server.GetDataAdapter(cmd); - updatedRowsDataTable.BeginLoadData(); - adapter.Fill(updatedRowsDataTable); - updatedRowsDataTable.EndLoadData(); - updateCon.Close(); - } - var updatedRowsReportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, updatedRowsDataTable); - updatedRowsReportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); - var updatedByPivotRowStatesOverDataLoadRunId = updatedRowsReportBuilder.GetByPivotRowStatesOverDataLoadRunId(); - - foreach (var updatedCategory in existingIncomingPivotCategories) - { - updatedByPivotRowStatesOverDataLoadRunId.TryGetValue(updatedCategory, out DQEStateOverDataLoadRunId incomingState); - foreach (var loadId in incomingState.RowsPassingValidationByDataLoadRunID.Keys) - { - incomingState.AllColumnStates.TryGetValue(loadId, out ColumnState[] columnStates); - foreach (var columnState in columnStates) - { - columnState.Commit(evaluation, updatedCategory, con.Connection, con.Transaction); - ColumnStates.Add(columnState); - } - } - } - } List AllColumns = new(); foreach (var 
columnState in ColumnStates) { @@ -451,27 +410,30 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li newByPivotCategoryCubesOverTime = new();//reset var unchangedPivotCategories = previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory)).Select(rs => rs.PivotCategory).Distinct(); foreach (var previousRowState in previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) ; + newByPivotCategoryCubesOverTime.TryGetValue("ALL", out var value); + if (value is null) + { + newByPivotCategoryCubesOverTime["ALL"] = new PeriodicityCubesOverTime("ALL"); + } foreach (var pivotCategory in unchangedPivotCategories) { var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, pivotCategory, false); + newByPivotCategoryCubesOverTime.TryGetValue(pivotCategory, out value); + if (value is null) + { + newByPivotCategoryCubesOverTime[pivotCategory] = new PeriodicityCubesOverTime(pivotCategory); + } + foreach (var row in previousPeriodicity.AsEnumerable()) { var countOfRecords = int.Parse(row[2].ToString()); for (var i = 0; i < countOfRecords; i++) { - newByPivotCategoryCubesOverTime.TryGetValue(pivotCategory, out var value); - if (value is null) - { - newByPivotCategoryCubesOverTime[pivotCategory] = new PeriodicityCubesOverTime(pivotCategory); - } + Consequence.TryParse(row[3].ToString(), out Consequence consequence); var date = DateTime.Parse(row[1].ToString()); newByPivotCategoryCubesOverTime[pivotCategory].IncrementHyperCube(date.Year, date.Month, consequence); - newByPivotCategoryCubesOverTime.TryGetValue("ALL", out value); - if (value is null) - { - newByPivotCategoryCubesOverTime["ALL"] = new PeriodicityCubesOverTime("ALL"); - } + 
newByPivotCategoryCubesOverTime["ALL"].IncrementHyperCube(date.Year, date.Month, consequence); } } @@ -512,7 +474,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var state = cube.GetStateForConsequence(consequence); for (var i = 0; i < state.CountOfRecords; i++) { - newByPivotCategoryCubesOverTime.TryGetValue(category, out var value); + newByPivotCategoryCubesOverTime.TryGetValue(category, out value); if (value is null) { newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); @@ -564,7 +526,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var state = cube.GetStateForConsequence(consequence); for (var i = 0; i < state.CountOfRecords; i++) { - newByPivotCategoryCubesOverTime.TryGetValue(category, out var value); + newByPivotCategoryCubesOverTime.TryGetValue(category, out value); if (value is null) { newByPivotCategoryCubesOverTime[category] = new PeriodicityCubesOverTime(category); From b4ada9f589fdf72de91dbab1924d771e47829ea8 Mon Sep 17 00:00:00 2001 From: James Friel Date: Thu, 19 Dec 2024 15:48:56 +0000 Subject: [PATCH 30/35] add note --- .../DataQualityEngine/Reports/CatalogueConstraintReport.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 263a134c8f..88058d5444 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -186,6 +186,10 @@ e is OperationCanceledException } } + //Notes + // this is technically more efficient than a full DQE, but ot's pretty rubbish for categories with updates as we recalculate for the whole category + //may be worth thinking about how we can keep existing records and modify/add to them depending on what's goin on + public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener 
listener, CancellationToken cancellationToken) From 2bace40f4d20c99a3eda5f20873b02ec45da8031 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 6 Jan 2025 08:18:44 +0000 Subject: [PATCH 31/35] tidy up --- Rdmp.Core/CommandLine/Options/DqeOptions.cs | 3 ++ Rdmp.Core/CommandLine/Runners/DqeRunner.cs | 2 +- .../Modules/Mutilators/DQEPostLoadRunner.cs | 5 +- .../DataQualityEngine/Data/ColumnState.cs | 3 +- .../Data/PeriodicityState.cs | 1 - .../Reports/CatalogueConstraintReport.cs | 54 +++++++------------ .../Reports/ReportBuilder.cs | 18 +++---- .../QueryBuilding/CohortQueryBuilderResult.cs | 1 - 8 files changed, 36 insertions(+), 51 deletions(-) diff --git a/Rdmp.Core/CommandLine/Options/DqeOptions.cs b/Rdmp.Core/CommandLine/Options/DqeOptions.cs index b3e89a4425..fa5dabe5f7 100644 --- a/Rdmp.Core/CommandLine/Options/DqeOptions.cs +++ b/Rdmp.Core/CommandLine/Options/DqeOptions.cs @@ -19,4 +19,7 @@ public class DqeOptions : RDMPCommandLineOptions [Option('d', "DataLoad", HelpText = "ID of the Data Load to run the DQE on. 
Adds new data to existing DQE results if they exist", Required = false)] public string DataLoadUpdateID { get; set; } + + [Option('t', "Timeout", HelpText = "How long(in seconds) each internal SQL command should brun for before timing out")] + public int CommandTimeout { get; set; } } \ No newline at end of file diff --git a/Rdmp.Core/CommandLine/Runners/DqeRunner.cs b/Rdmp.Core/CommandLine/Runners/DqeRunner.cs index 76aeeaba05..36b71dc2bc 100644 --- a/Rdmp.Core/CommandLine/Runners/DqeRunner.cs +++ b/Rdmp.Core/CommandLine/Runners/DqeRunner.cs @@ -40,7 +40,7 @@ public override int Run(IRDMPPlatformRepositoryServiceLocator repositoryLocator, { case CommandLineActivity.run: if (dataLoadID is not null) - report.UpdateReport(catalogue, (int)dataLoadID, listener, token.AbortToken); + report.UpdateReport(catalogue, (int)dataLoadID, _options.CommandTimeout, listener, token.AbortToken); else report.GenerateReport(catalogue, listener, token.AbortToken); return 0; diff --git a/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs b/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs index 4e4b79f0fd..c9128e3270 100644 --- a/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs +++ b/Rdmp.Core/DataLoad/Modules/Mutilators/DQEPostLoadRunner.cs @@ -27,6 +27,8 @@ namespace Rdmp.Core.DataLoad.Modules.Mutilators; public class DQEPostLoadRunner : IMutilateDataTables { + [DemandsInitialization("Timeout length for each query required to run the DQE update",defaultValue:50000)] + public int Timeout { get; set; } public void Check(ICheckNotifier notifier) { } @@ -74,7 +76,8 @@ public ExitCodeType Mutilate(IDataLoadJob job) { Catalogue = catalogue.ID.ToString(), DataLoadUpdateID = job.DataLoadInfo.ID.ToString(), - Command = CommandLineActivity.run + Command = CommandLineActivity.run, + CommandTimeout = Timeout }; var runner = RunnerFactory.CreateRunner(new ThrowImmediatelyActivator(job.RepositoryLocator), options); runner.Run(job.RepositoryLocator, 
ThrowImmediatelyDataLoadEventListener.Quiet, new AcceptAllCheckNotifier(), diff --git a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs index 12cba851a8..9c51826277 100644 --- a/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/ColumnState.cs @@ -133,8 +133,7 @@ protected ColumnState() public void Commit(Evaluation evaluation, string pivotCategory, DbConnection con, DbTransaction transaction) { if (IsCommitted) - return; - //throw new NotSupportedException("ColumnState was already committed"); + throw new NotSupportedException("ColumnState was already committed"); var sql = $"INSERT INTO ColumnState(TargetProperty,DataLoadRunID,Evaluation_ID,CountCorrect,CountDBNull,ItemValidatorXML,CountMissing,CountWrong,CountInvalidatesRow,PivotCategory)VALUES({"@TargetProperty"},{DataLoadRunID},{evaluation.ID},{CountCorrect},{CountDBNull},@ItemValidatorXML,{CountMissing},{CountWrong},{CountInvalidatesRow},@PivotCategory)"; diff --git a/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs b/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs index a82e7498c5..7636c26deb 100644 --- a/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs +++ b/Rdmp.Core/DataQualityEngine/Data/PeriodicityState.cs @@ -7,7 +7,6 @@ using System; using System.Collections.Generic; using System.Data; -using Rdmp.Core.MapsDirectlyToDatabaseTable; using Rdmp.Core.ReusableLibraryCode; using Rdmp.Core.Validation.Constraints; diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 88058d5444..49bc023509 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -1,4 +1,4 @@ -// Copyright (c) The University of Dundee 2018-2019 +// Copyright (c) The University of Dundee 2018-2025 // This file is part of the Research Data Management Platform 
(RDMP). // RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. @@ -8,18 +8,10 @@ using System.Collections.Generic; using System.Data; using System.Data.Common; -using System.Diagnostics; using System.Linq; -using System.Reflection; -using System.Runtime.InteropServices; using System.Threading; using FAnsi.Discovery; using MongoDB.Driver; -using NPOI.OpenXmlFormats.Spreadsheet; -using NPOI.OpenXmlFormats.Vml; -using NPOI.SS.Formula.Functions; -using NPOI.Util; -using Org.BouncyCastle.Security.Certificates; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.Defaults; using Rdmp.Core.DataLoad.Triggers; @@ -27,7 +19,6 @@ using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; using Rdmp.Core.Logging; using Rdmp.Core.Logging.Listeners; -using Rdmp.Core.MapsDirectlyToDatabaseTable; using Rdmp.Core.QueryBuilding; using Rdmp.Core.Repositories; using Rdmp.Core.ReusableLibraryCode.Checks; @@ -36,7 +27,6 @@ using Rdmp.Core.Validation; using Rdmp.Core.Validation.Constraints; using Rdmp.Core.Validation.Constraints.Secondary.Predictor; -using static Terminal.Gui.Application; namespace Rdmp.Core.DataQualityEngine.Reports; @@ -54,10 +44,8 @@ public class CatalogueConstraintReport : DataQualityReport private Validator _validator; private bool _containsDataLoadID; - public static int MaximumPivotValues = 5000; - - private Dictionary byPivotRowStatesOverDataLoadRunId = new(); - private Dictionary byPivotCategoryCubesOverTime = new(); + private Dictionary byPivotRowStatesOverDataLoadRunId = []; + private Dictionary byPivotCategoryCubesOverTime = []; private 
IExternalDatabaseServer _loggingServer; private string _loggingTask; @@ -191,7 +179,7 @@ e is OperationCanceledException //may be worth thinking about how we can keep existing records and modify/add to them depending on what's goin on - public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener listener, + public void UpdateReport(ICatalogue c, int dataLoadID, int? commandTimeout, IDataLoadEventListener listener, CancellationToken cancellationToken) { _dataLoadID = dataLoadID; @@ -216,7 +204,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li if (_dataLoadID is not null) qb.AddCustomLine($"{SpecialFieldNames.DataLoadRunID} = {_dataLoadID}", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); var cmd = _server.GetCommand(qb.SQL, con); - cmd.CommandTimeout = 500000; + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; var adapter = _server.GetDataAdapter(cmd); rDT.BeginLoadData(); adapter.Fill(rDT); @@ -250,7 +239,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li var pivotColumnInfo = _catalogue.CatalogueItems.Where(ci => ci.Name == _pivotCategory).FirstOrDefault(); if (pivotColumnInfo is null) throw new Exception("Can't find column infor for pivot category"); var tableInfo = pivotColumnInfo.ColumnInfo.TableInfo; - var dataDiffFetcher = new DiffDatabaseDataFetcher(10000000, tableInfo, (int)_dataLoadID, 50000);//todo update these numbers + + var dataDiffFetcher = new DiffDatabaseDataFetcher(2147483647, tableInfo, (int)_dataLoadID, commandTimeout != null ? (int)commandTimeout : 30); dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); //pivot categories that have been replaces 100%? 
var replacedPivotCategories = previousCategories.Where(c => @@ -305,7 +295,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); var cmd = _server.GetCommand(qb.SQL, updateCon); - cmd.CommandTimeout = 500000; + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; var adapter = _server.GetDataAdapter(cmd); updatedRowsDataTable.BeginLoadData(); adapter.Fill(updatedRowsDataTable); @@ -337,7 +328,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } } } - List AllStates = new(); + List AllStates = []; foreach (var rowState in evaluation.RowStates) { if (!AllStates.Any(state => state.DataLoadRunID == rowState.DataLoadRunID)) @@ -375,7 +366,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li cm.Commit(evaluation, previousColumnState.PivotCategory, con.Connection, con.Transaction); ColumnStates.Add(cm); } - List AllColumns = new(); + List AllColumns = []; foreach (var columnState in ColumnStates) { if (!AllColumns.Any(state => state.DataLoadRunID == columnState.DataLoadRunID && state.TargetProperty == columnState.TargetProperty && state.PivotCategory == columnState.PivotCategory)) @@ -411,7 +402,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li //* Periodicity States *// //Unchanged - newByPivotCategoryCubesOverTime = new();//reset + newByPivotCategoryCubesOverTime = [];//reset var unchangedPivotCategories = previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory)).Select(rs => rs.PivotCategory).Distinct(); foreach (var previousRowState in 
previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) ; newByPivotCategoryCubesOverTime.TryGetValue("ALL", out var value); @@ -434,7 +425,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li for (var i = 0; i < countOfRecords; i++) { - Consequence.TryParse(row[3].ToString(), out Consequence consequence); + Enum.TryParse(row[3].ToString(), out Consequence consequence); var date = DateTime.Parse(row[1].ToString()); newByPivotCategoryCubesOverTime[pivotCategory].IncrementHyperCube(date.Year, date.Month, consequence); @@ -454,7 +445,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', existingIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); var cmd = _server.GetCommand(qb.SQL, updateCon); - cmd.CommandTimeout = 500000; + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; var adapter = _server.GetDataAdapter(cmd); updatedRowsDataTable.BeginLoadData(); adapter.Fill(updatedRowsDataTable); @@ -473,7 +465,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li foreach (var month in periodicityCubes.Keys) { var cube = periodicityCubes[month]; - foreach (var consequence in Enum.GetValues(typeof(Consequence)).Cast().ToList()) + foreach (var consequence in Enum.GetValues().Cast().ToList()) { var state = cube.GetStateForConsequence(consequence); for (var i = 0; i < state.CountOfRecords; i++) @@ -490,11 +482,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } } - //want to add this to newByPivotCategoryCubesOverTime - } } - //foreach (var newCategory in newIncomingPivotCategories) if (newIncomingPivotCategories.Any()) { 
var updatedRowsDataTable = new DataTable(); @@ -506,7 +495,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li qb.AddColumnRange(_catalogue.GetAllExtractionInformation(ExtractionCategory.Any)); qb.AddCustomLine($"{pivotColumn} in ({string.Join(',', newIncomingPivotCategories.Select(i => $"'{i}'"))})", FAnsi.Discovery.QuerySyntax.QueryComponent.WHERE); var cmd = _server.GetCommand(qb.SQL, updateCon); - cmd.CommandTimeout = 500000; + if (commandTimeout is not null) + cmd.CommandTimeout = (int)commandTimeout; var adapter = _server.GetDataAdapter(cmd); updatedRowsDataTable.BeginLoadData(); adapter.Fill(updatedRowsDataTable); @@ -542,18 +532,14 @@ public void UpdateReport(ICatalogue c, int dataLoadID, IDataLoadEventListener li } } - //want to add this to newByPivotCategoryCubesOverTime } } - //ADD all the new stuff + //add all the new stuff foreach (var v in newByPivotCategoryCubesOverTime.Values) { v.CommitToDatabase(evaluation); } - - //var previousPeriodicity = PeriodicityState.GetPeriodicityForDataTableForEvaluation(previousEvaluation, false); - dqeRepository.EndTransactedConnection(true); } diff --git a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs index 954e4a8de2..6fafeae879 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs @@ -4,9 +4,7 @@ // RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. // You should have received a copy of the GNU General Public License along with RDMP. If not, see . 
using Rdmp.Core.Curation.Data; -using Rdmp.Core.DataLoad.Triggers; using Rdmp.Core.DataQualityEngine.Reports.PeriodicityHelpers; -using Rdmp.Core.Logging; using Rdmp.Core.QueryBuilding; using Rdmp.Core.Repositories; using Rdmp.Core.ReusableLibraryCode.Progress; @@ -18,9 +16,7 @@ using System.Data.Common; using System.Diagnostics; using System.Linq; -using System.Text; using System.Threading; -using System.Threading.Tasks; namespace Rdmp.Core.DataQualityEngine.Reports; @@ -32,22 +28,22 @@ public class ReportBuilder private readonly string _dataLoadRunFieldName; //where the data is located - private QueryBuilder _queryBuilder; - private Validator _validator; - private bool _containsDataLoadID; + private readonly QueryBuilder _queryBuilder; + private readonly Validator _validator; + private readonly bool _containsDataLoadID; public static int MaximumPivotValues = 5000; private Dictionary byPivotRowStatesOverDataLoadRunId = new(); private Dictionary byPivotCategoryCubesOverTime = new(); - private string _timePeriodicityField; - private string _pivotCategory; - private ICatalogue _catalogue; + private readonly string _timePeriodicityField; + private readonly string _pivotCategory; + private readonly ICatalogue _catalogue; private bool _haveComplainedAboutNullCategories; private bool _haveComplainedAboutTrailingWhitespaces; - private DataTable _resultsDT = new(); + private readonly DataTable _resultsDT = new(); public ReportBuilder(ICatalogue catalogue, Validator validator, QueryBuilder queryBuilder, string dataLoadRunFieldName, bool containsDataLoadID, string timePeriodicityField, string pivotCategory, DbDataReader results) { _catalogue = catalogue; diff --git a/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs b/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs index dda7484df6..d960895b3c 100644 --- a/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs +++ b/Rdmp.Core/QueryBuilding/CohortQueryBuilderResult.cs @@ -403,7 +403,6 @@ private void MakeCacheDecision() 
foreach (var dependency in Dependencies) { _log.AppendLine($"Evaluating '{dependency.CohortSet}'"); - var x = dependency.CohortSet.Catalogue.GetTableInfoList(false); foreach (var dependantTable in dependency.CohortSet.Catalogue.GetTableInfoList(false)) HandleDependency(dependency, false, dependantTable); From 2d3a7fadcb58a494551d179b919e54b990d0cc62 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 6 Jan 2025 08:21:40 +0000 Subject: [PATCH 32/35] tidy up --- .../DataQualityEngine/Reports/CatalogueConstraintReport.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 49bc023509..756d5a7b96 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -231,7 +231,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, int? commandTimeout, IDat var evaluation = new Evaluation(dqeRepository, _catalogue); - //new pivoutCategories coming in + //new pivotCategories coming in var newIncomingPivotCategories = incomingPivotCategories.Where(c => !previousCategories.Contains(c)); List ColumnStates = []; From 851416e4b5324ce2a13bf361d39941232cd0d558 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 6 Jan 2025 08:31:19 +0000 Subject: [PATCH 33/35] add docs --- CHANGELOG.md | 2 ++ Documentation/DataLoadEngine/DQEPostLoadRunner.md | 12 ++++++++++++ .../Reports/CatalogueConstraintReport.cs | 4 ++-- 3 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 Documentation/DataLoadEngine/DQEPostLoadRunner.md diff --git a/CHANGELOG.md b/CHANGELOG.md index f56222773f..737ed346d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - Build on and target .Net 9 rather than 8 +- Add DQE Updater Mutilator for Data Loads see 
[DQE Post Load runner](./Documentation/DataLoadEngine/DQEPostLoadRunner.md) + ## [8.4.2] - 2024-12-18 - Fix issue with MEF constructing Remote Table Attachers diff --git a/Documentation/DataLoadEngine/DQEPostLoadRunner.md b/Documentation/DataLoadEngine/DQEPostLoadRunner.md new file mode 100644 index 0000000000..d244e87a4a --- /dev/null +++ b/Documentation/DataLoadEngine/DQEPostLoadRunner.md @@ -0,0 +1,12 @@ +# DQE Post Load Runner + +The DQE post-load runner can be used to automatically perform a DQE update once a data load completes. +The runner attempts to reuse any existing DQE results that have been unaffected by the data load, however this process can still be slow if the catalogue data is large and/or complex. + +## Requirements +The DQE post-load runner requires an existing DQE result to exist, otherwise it will fail. + +## Configuration +The runner makes a number of queries to the database, the timeout for these commands is configurable via the timeout option. + + diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index 756d5a7b96..f8a65c9a82 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -239,7 +239,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, int? commandTimeout, IDat var pivotColumnInfo = _catalogue.CatalogueItems.Where(ci => ci.Name == _pivotCategory).FirstOrDefault(); if (pivotColumnInfo is null) throw new Exception("Can't find column infor for pivot category"); var tableInfo = pivotColumnInfo.ColumnInfo.TableInfo; - + var dataDiffFetcher = new DiffDatabaseDataFetcher(2147483647, tableInfo, (int)_dataLoadID, commandTimeout != null ? (int)commandTimeout : 30); dataDiffFetcher.FetchData(new AcceptAllCheckNotifier()); //pivot categories that have been replaces 100%? @@ -433,7 +433,7 @@ public void UpdateReport(ICatalogue c, int dataLoadID, int? 
commandTimeout, IDat } } } - //what about the replacements? + //replacements if (existingIncomingPivotCategories.Any()) { var updatedRowsDataTable = new DataTable(); From 1debdcb8449c577d71b1f052e4304323406a5adb Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 6 Jan 2025 10:21:28 +0000 Subject: [PATCH 34/35] codeql updates --- .../DQEPartialUpdateTests.cs | 29 +++++-------------- .../Reports/CatalogueConstraintReport.cs | 6 ++-- .../Reports/ReportBuilder.cs | 6 ++-- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs index 6b2952f7af..71737a6881 100644 --- a/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs +++ b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs @@ -1,8 +1,5 @@ -using NLog; -using NPOI.POIFS.Properties; -using NPOI.SS.Formula.Functions; +using NPOI.SS.Formula.Functions; using NUnit.Framework; -using Org.BouncyCastle.Tls; using Rdmp.Core.Curation; using Rdmp.Core.Curation.Data; using Rdmp.Core.Curation.Data.DataLoad; @@ -20,33 +17,25 @@ using Rdmp.Core.DataLoad.Triggers; using Rdmp.Core.DataQualityEngine.Data; using Rdmp.Core.DataQualityEngine.Reports; -using Rdmp.Core.Logging; using Rdmp.Core.Repositories; using Rdmp.Core.ReusableLibraryCode.Checks; using Rdmp.Core.ReusableLibraryCode.Progress; using Rdmp.Core.Tests.DataLoad.Engine.Integration; -using System; using System.Collections.Generic; using System.Data; using System.Diagnostics.CodeAnalysis; using System.IO; using System.Linq; -using System.Security.Cryptography; -using System.Text; -using System.Text.RegularExpressions; using System.Threading; -using System.Threading.Tasks; -using Tests.Common; -using static Org.BouncyCastle.Math.EC.ECCurve; namespace Rdmp.Core.Tests.DataQualityEngine { internal class DQEPartialUpdateTests : DataLoadEngineTestsBase { - string validatorXML = "\r\n\r\n \r\n \r\n \r\n Wrong\r\n \r\n chi\r\n \r\n \r\n \r\n 
time\r\n \r\n \r\n \r\n"; - string fileLocation = Path.GetTempPath(); - string fileName = "SteppedDQEPartialUpdates.csv"; + readonly string validatorXML = "\r\n\r\n \r\n \r\n \r\n Wrong\r\n \r\n chi\r\n \r\n \r\n \r\n time\r\n \r\n \r\n \r\n"; + readonly string fileLocation = Path.GetTempPath(); + readonly string fileName = "SteppedDQEPartialUpdates.csv"; [Test] public void SteppedDQEPartialUpdates() @@ -63,7 +52,7 @@ public void SteppedDQEPartialUpdates() var table = server.CreateTable("PartialToaDQE", dt); table.CreatePrimaryKey(table.DiscoverColumns().Where(c => c.GetRuntimeName() == "chi").ToArray()); - + dt.Dispose(); var catalogue = new Catalogue(CatalogueRepository, "PartialToaDQE"); var importer = new TableInfoImporter(CatalogueRepository, table); importer.DoImport(out var _tableInfo, out var _columnInfos); @@ -93,7 +82,7 @@ public void SteppedDQEPartialUpdates() var listener = new ToMemoryDataLoadEventListener(false); report.GenerateReport(catalogue, listener, source.Token); - + source.Dispose(); var lmd = new LoadMetadata(CatalogueRepository, "MyLoad"); lmd.LocationOfForLoadingDirectory = Path.GetTempPath(); lmd.LocationOfForArchivingDirectory = Path.GetTempPath(); @@ -144,7 +133,6 @@ public void SteppedDQEPartialUpdates() dt.Rows.Add(new string[] { "1111111112", "A", "2024-11-01" }); dt.Rows.Add(new string[] { "1111111113", "B", "2024-10-01" }); SetupFile(dt); - PerformLoad(lmd, logManager); //end of first load report = new CatalogueConstraintReport(catalogue, SpecialFieldNames.DataLoadRunID) @@ -296,10 +284,6 @@ public void SteppedDQEPartialUpdates() evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList();//.Where(e => e.CatalogueID == catalogue.ID).ToList(); Assert.That(evaluations.Count, Is.EqualTo(13)); CompareEvaluations(evaluations[12], evaluations[11]); - - - Assert.That(true, Is.EqualTo(true)); - } private void SetupFile(DataTable dt) @@ -318,6 +302,7 @@ private void SetupFile(DataTable dt) lines.Add(string.Join(',', 
row.ItemArray.Select(i => i.ToString()))); } File.AppendAllLines(Path.Combine(fileLocation, fileName), lines); + dt.Dispose(); } private void CompareEvaluations(Evaluation e1, Evaluation e2) diff --git a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs index f8a65c9a82..4c46c07d52 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/CatalogueConstraintReport.cs @@ -215,10 +215,8 @@ public void UpdateReport(ICatalogue c, int dataLoadID, int? commandTimeout, IDat var reportBuilder = new ReportBuilder(c, _validator, _queryBuilder, _dataLoadRunFieldName, _containsDataLoadID, _timePeriodicityField, _pivotCategory, rDT); reportBuilder.BuildReportInternals(cancellationToken, forker, dqeRepository); var newByPivotRowStatesOverDataLoadRunId = reportBuilder.GetByPivotRowStatesOverDataLoadRunId(); - var newByPivotCategoryCubesOverTime = reportBuilder.GetByPivotCategoryCubesOverTime(); var pivotColumn = c.PivotCategory_ExtractionInformation.ColumnInfo.GetRuntimeName(); - var timeColumn = c.TimeCoverage_ExtractionInformation.ColumnInfo.GetRuntimeName(); var incomingPivotCategories = rDT.AsEnumerable().Select(r => r[pivotColumn].ToString()).ToList().Distinct(); @@ -402,9 +400,9 @@ public void UpdateReport(ICatalogue c, int dataLoadID, int? 
commandTimeout, IDat //* Periodicity States *// //Unchanged - newByPivotCategoryCubesOverTime = [];//reset + Dictionary newByPivotCategoryCubesOverTime = [];//reset - var unchangedPivotCategories = previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory)).Select(rs => rs.PivotCategory).Distinct(); foreach (var previousRowState in previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory))) ; + var unchangedPivotCategories = previousRowSates.Where(rs => rs.PivotCategory != "ALL" && !existingIncomingPivotCategories.Contains(rs.PivotCategory) && !replacedPivotCategories.Contains(rs.PivotCategory)).Select(rs => rs.PivotCategory).Distinct(); newByPivotCategoryCubesOverTime.TryGetValue("ALL", out var value); if (value is null) { diff --git a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs index 6fafeae879..d63016b12a 100644 --- a/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs +++ b/Rdmp.Core/DataQualityEngine/Reports/ReportBuilder.cs @@ -32,10 +32,10 @@ public class ReportBuilder private readonly Validator _validator; private readonly bool _containsDataLoadID; - public static int MaximumPivotValues = 5000; + private static readonly int MaximumPivotValues = 5000; - private Dictionary byPivotRowStatesOverDataLoadRunId = new(); - private Dictionary byPivotCategoryCubesOverTime = new(); + private readonly Dictionary byPivotRowStatesOverDataLoadRunId = []; + private readonly Dictionary byPivotCategoryCubesOverTime = []; private readonly string _timePeriodicityField; private readonly string _pivotCategory; From fcac338d6d546daa699344dc8104893cd7ef7679 Mon Sep 17 00:00:00 2001 From: James Friel Date: Mon, 6 Jan 2025 10:59:34 +0000 Subject: [PATCH 35/35] update test --- 
Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs index 71737a6881..e58425ba1a 100644 --- a/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs +++ b/Rdmp.Core.Tests/DataQualityEngine/DQEPartialUpdateTests.cs @@ -82,7 +82,6 @@ public void SteppedDQEPartialUpdates() var listener = new ToMemoryDataLoadEventListener(false); report.GenerateReport(catalogue, listener, source.Token); - source.Dispose(); var lmd = new LoadMetadata(CatalogueRepository, "MyLoad"); lmd.LocationOfForLoadingDirectory = Path.GetTempPath(); lmd.LocationOfForArchivingDirectory = Path.GetTempPath(); @@ -280,8 +279,8 @@ public void SteppedDQEPartialUpdates() report.Check(ThrowImmediatelyCheckNotifier.Quiet); report.GenerateReport(catalogue, listener, source.Token); - - evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList();//.Where(e => e.CatalogueID == catalogue.ID).ToList(); + source.Dispose(); + evaluations = dqeRepository.GetAllObjectsWhere("CatalogueID", catalogue.ID).ToList(); Assert.That(evaluations.Count, Is.EqualTo(13)); CompareEvaluations(evaluations[12], evaluations[11]); }