From 6525984f5a4c62941e6bf1f734659a1e46537258 Mon Sep 17 00:00:00 2001 From: Heinrich Lukas Weil Date: Tue, 1 Aug 2023 14:13:13 +0400 Subject: [PATCH 1/3] fix reference column grouping and empty parsing --- src/ISA/ISA.Spreadsheet/ArcTable.fs | 6 +- src/ISA/ISA/ArcTypes/CompositeColumn.fs | 3 +- src/ISA/ISA/Regex.fs | 22 +++++-- tests/ISA/ISA.Tests/Identifier.Tests.fs | 2 +- tests/ISA/ISA.Tests/Regex.Tests.fs | 88 ++++++++++++++++++++++++- 5 files changed, 110 insertions(+), 11 deletions(-) diff --git a/src/ISA/ISA.Spreadsheet/ArcTable.fs b/src/ISA/ISA.Spreadsheet/ArcTable.fs index 75ecfbbf..9d67604d 100644 --- a/src/ISA/ISA.Spreadsheet/ArcTable.fs +++ b/src/ISA/ISA.Spreadsheet/ArcTable.fs @@ -62,11 +62,11 @@ let annotationTablePrefix = "annotationTable" let groupColumnsByHeader (columns : list) = columns - |> Aux.List.groupWhen (fun c -> - ISA.Regex.tryParseTermAnnotation c.[1].Value + |> Aux.List.groupWhen (fun c -> + ISA.Regex.tryParseReferenceColumnHeader c.[1].Value |> Option.isNone && - c.[1].Value <> "Unit" + (c.[1].Value.StartsWith "Unit" |> not) ) /// Returns the annotation table of the worksheet if it exists, else returns None diff --git a/src/ISA/ISA/ArcTypes/CompositeColumn.fs b/src/ISA/ISA/ArcTypes/CompositeColumn.fs index e89025e2..90f00bbb 100644 --- a/src/ISA/ISA/ArcTypes/CompositeColumn.fs +++ b/src/ISA/ISA/ArcTypes/CompositeColumn.fs @@ -32,8 +32,7 @@ type CompositeColumn = { true | h, c -> if raiseExeption then - let n = System.Math.Min(c.Length,3) - let exampleCells = c.[n] + let exampleCells = c.[0] let msg = $"Invalid combination of header `{h}` and cells `{exampleCells}`" failwith msg // Maybe still return `msg` somehow if `raiseExeption` is false? diff --git a/src/ISA/ISA/Regex.fs b/src/ISA/ISA/Regex.fs index a7141cef..3fe09429 100644 --- a/src/ISA/ISA/Regex.fs +++ b/src/ISA/ISA/Regex.fs @@ -36,7 +36,7 @@ module Pattern = /// /// the id part "MS:1003022" is captured as `id` group. [] - let ReferenceColumnPattern = @"(Term Source REF|Term Accession Number)\s\((?.+)\)" + let ReferenceColumnPattern = @"(Term Source REF|Term Accession Number)\s\((?.*)\)" /// Hits Term Accession Number column header /// @@ -44,7 +44,7 @@ module Pattern = /// /// the id part "MS:1003022" is captured as `id` group. [] - let TermSourceREFColumnPattern = @"Term Source REF\s\((?.+)\)" + let TermSourceREFColumnPattern = @"Term Source REF\s\((?.*)\)" /// Hits Term Source REF column header /// @@ -52,7 +52,7 @@ module Pattern = /// /// the id part "MS:1003022" is captured as `id` group. [] - let TermAccessionNumberColumnPattern = @"Term Accession Number\s\((?.+)\)" + let TermAccessionNumberColumnPattern = @"Term Accession Number\s\((?.*)\)" /// Hits term accession, without id: ENVO:01001831 [] @@ -107,6 +107,14 @@ module ActivePatterns = if m.Success then Some(m) else None + /// Matches any column header starting with some text, followed by one whitespace and a term name inside squared brackets. + let (|ReferenceColumnHeader|_|) input = + match input with + | Regex Pattern.ReferenceColumnPattern r -> + {|Annotation = r.Groups.["id"].Value|} + |> Some + | _ -> None + /// Matches any column header starting with some text, followed by one whitespace and a term name inside squared brackets. let (|TermColumn|_|) input = match input with @@ -191,7 +199,7 @@ module ActivePatterns = | Regex Pattern.TermSourceREFColumnPattern r -> match r.Groups.["id"].Value with | TermAnnotation r -> Some r - | _ -> None + | _ -> Some {|LocalTAN = ""; TermAccessionNumber = ""; TermSourceREF = ""|} | _ -> None /// Matches a "Term Accession Number (ShortTerm)" column header and returns the ShortTerm as Term Source Ref and Annotation Number. @@ -202,6 +210,7 @@ module ActivePatterns = | Regex Pattern.TermAccessionNumberColumnPattern r -> match r.Groups.["id"].Value with | TermAnnotation r -> Some r + | _ -> Some {|LocalTAN = ""; TermAccessionNumber = ""; TermSourceREF = ""|} | _ -> None | _ -> None @@ -235,6 +244,11 @@ open System open System.Text.RegularExpressions +let tryParseReferenceColumnHeader (str : string) = + match str.Trim() with + | ReferenceColumnHeader v -> + Some v + | _ -> None let tryParseTermAnnotationShort (str:string) = match str.Trim() with diff --git a/tests/ISA/ISA.Tests/Identifier.Tests.fs b/tests/ISA/ISA.Tests/Identifier.Tests.fs index 1759b2e0..0ad6d956 100644 --- a/tests/ISA/ISA.Tests/Identifier.Tests.fs +++ b/tests/ISA/ISA.Tests/Identifier.Tests.fs @@ -35,6 +35,6 @@ let private tests_checkValidCharacters = testList "checkValidCharacters" [ let main = - testList "ArcInvestigation" [ + testList "Identifier" [ tests_checkValidCharacters ] \ No newline at end of file diff --git a/tests/ISA/ISA.Tests/Regex.Tests.fs b/tests/ISA/ISA.Tests/Regex.Tests.fs index 943157b7..b27c215e 100644 --- a/tests/ISA/ISA.Tests/Regex.Tests.fs +++ b/tests/ISA/ISA.Tests/Regex.Tests.fs @@ -8,7 +8,7 @@ open Fable.Mocha open Expecto #endif -let tests_AutoGeneratedTableName = +let private tests_AutoGeneratedTableName = testList "AutoGeneratedTableName" [ testCase "match" (fun () -> let testString = @"New Table 10" @@ -53,8 +53,94 @@ let tests_AutoGeneratedTableName = ) ] +let private tests_AnnotationTableColums = + testList "AnnotationTableColumns" [ + testCase "Term Source REF" (fun () -> + let localID = "12345" + let space = "UO" + let testString = $"Term Source REF ({space}:{localID})" + let r = + match testString with + | Regex.ActivePatterns.TSRColumnHeader result -> Some result + | _ -> None + Expect.isSome r "Could not match TSRColumnHeader" + let rv = r.Value + Expect.equal rv.LocalTAN localID "LocalId did not match" + Expect.equal rv.TermSourceREF space "TermSourceREF did not match" + Expect.equal rv.TermAccessionNumber $"{space}:{localID}" "TermAccessionNumber did not match" + ) + testCase "Term Source REF Empty" (fun () -> + let testString = $"Term Source REF ()" + let r = + match testString with + | Regex.ActivePatterns.TSRColumnHeader result -> Some result + | _ -> None + Expect.isSome r "Could not match TSRColumnHeader" + let rv = r.Value + Expect.equal rv.LocalTAN "" "LocalID should be empty" + Expect.equal rv.TermSourceREF "" "TermSourceREF should be empty" + Expect.equal rv.TermAccessionNumber "" "TermAccessionNumber should be empty" + ) + testCase "Term Accession Number" (fun () -> + let localID = "12345" + let space = "UO" + let testString = $"Term Accession Number ({space}:{localID})" + let r = + match testString with + | Regex.ActivePatterns.TANColumnHeader result -> Some result + | _ -> None + Expect.isSome r "Could not match TANColumnHeader" + let rv = r.Value + Expect.equal rv.LocalTAN localID "LocalId did not match" + Expect.equal rv.TermSourceREF space "TermSourceREF did not match" + Expect.equal rv.TermAccessionNumber $"{space}:{localID}" "TermAccessionNumber did not match" + ) + testCase "Term Accession Number Empty" (fun () -> + let testString = $"Term Accession Number ()" + let r = + match testString with + | Regex.ActivePatterns.TANColumnHeader result -> Some result + | _ -> None + Expect.isSome r "Could not match TANColumnHeader" + let rv = r.Value + Expect.equal rv.LocalTAN "" "LocalID should be empty" + Expect.equal rv.TermSourceREF "" "TermSourceREF should be empty" + Expect.equal rv.TermAccessionNumber "" "TermAccessionNumber should be empty" + ) + testCase "Reference Column Header Empty" (fun () -> + + let testString = $"Term Accession Number ()" + let r = + match testString with + | Regex.ActivePatterns.ReferenceColumnHeader result -> Some result + | _ -> None + Expect.isSome r "Could not match TANColumnHeader" + let rv = r.Value + Expect.equal rv.Annotation "" "Annotation should be empty" + + let testString = $"Term Source REF ()" + let r = + match testString with + | Regex.ActivePatterns.ReferenceColumnHeader result -> Some result + | _ -> None + Expect.isSome r "Could not match TANColumnHeader" + let rv = r.Value + Expect.equal rv.Annotation "" "Annotation should be empty" + + let testString = $"Any Other String ()" + let r = + match testString with + | Regex.ActivePatterns.ReferenceColumnHeader result -> Some result + | _ -> None + Expect.isNone r "Should not match other String" + ) + ] + + + let main = testList "Regex" [ + tests_AnnotationTableColums tests_AutoGeneratedTableName ] From 0662e51572097e22af763b9c3a87d1578db7fc4c Mon Sep 17 00:00:00 2001 From: Heinrich Lukas Weil Date: Tue, 1 Aug 2023 14:13:52 +0400 Subject: [PATCH 2/3] update FsSpreadsheet reference to 3.2.0 --- src/ISA/ISA.Spreadsheet/ArcAssay.fs | 4 ++-- src/ISA/ISA.Spreadsheet/ArcStudy.fs | 4 ++-- src/ISA/ISA.Spreadsheet/ISA.Spreadsheet.fsproj | 2 +- .../ISA.Spreadsheet/InvestigationFile/Investigation.fs | 2 +- tests/ISA/ISA.Spreadsheet.Tests/FableTests.fs | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ISA/ISA.Spreadsheet/ArcAssay.fs b/src/ISA/ISA.Spreadsheet/ArcAssay.fs index 544d3a3d..a96a0d05 100644 --- a/src/ISA/ISA.Spreadsheet/ArcAssay.fs +++ b/src/ISA/ISA.Spreadsheet/ArcAssay.fs @@ -87,8 +87,8 @@ let fromFsWorkbook (doc:FsWorkbook) = ArcAssay.create(Identifier.createMissingIdentifier()) let sheets = doc.GetWorksheets() - |> List.choose ArcTable.tryFromFsWorksheet - if sheets.IsEmpty then + |> Seq.choose ArcTable.tryFromFsWorksheet + if sheets |> Seq.isEmpty then assayMetaData else assayMetaData.Tables <- ResizeArray(sheets) diff --git a/src/ISA/ISA.Spreadsheet/ArcStudy.fs b/src/ISA/ISA.Spreadsheet/ArcStudy.fs index 9f3073b2..811c5d0c 100644 --- a/src/ISA/ISA.Spreadsheet/ArcStudy.fs +++ b/src/ISA/ISA.Spreadsheet/ArcStudy.fs @@ -51,8 +51,8 @@ let fromFsWorkbook (doc:FsWorkbook) = let sheets = doc.GetWorksheets() - |> List.choose ArcTable.tryFromFsWorksheet - if sheets.IsEmpty then + |> Seq.choose ArcTable.tryFromFsWorksheet + if sheets |> Seq.isEmpty then studyMetadata else studyMetadata.Tables <- ResizeArray(sheets) diff --git a/src/ISA/ISA.Spreadsheet/ISA.Spreadsheet.fsproj b/src/ISA/ISA.Spreadsheet/ISA.Spreadsheet.fsproj index aa4abb84..303af82e 100644 --- a/src/ISA/ISA.Spreadsheet/ISA.Spreadsheet.fsproj +++ b/src/ISA/ISA.Spreadsheet/ISA.Spreadsheet.fsproj @@ -33,7 +33,7 @@ - + diff --git a/src/ISA/ISA.Spreadsheet/InvestigationFile/Investigation.fs b/src/ISA/ISA.Spreadsheet/InvestigationFile/Investigation.fs index 322de8ad..cc36a311 100644 --- a/src/ISA/ISA.Spreadsheet/InvestigationFile/Investigation.fs +++ b/src/ISA/ISA.Spreadsheet/InvestigationFile/Investigation.fs @@ -192,7 +192,7 @@ module ArcInvestigation = let fromFsWorkbook (doc:FsWorkbook) = try doc.GetWorksheets() - |> List.head + |> Seq.head |> FsWorksheet.getRows |> Seq.map SparseRow.fromFsRow |> fromRows diff --git a/tests/ISA/ISA.Spreadsheet.Tests/FableTests.fs b/tests/ISA/ISA.Spreadsheet.Tests/FableTests.fs index 18d1273a..dc96b403 100644 --- a/tests/ISA/ISA.Spreadsheet.Tests/FableTests.fs +++ b/tests/ISA/ISA.Spreadsheet.Tests/FableTests.fs @@ -18,18 +18,18 @@ let tests_typeTranspilation = testList "type transpilation" [ let wb = new FsWorkbook() let ws = FsWorksheet.init ("My Worksheet") wb.AddWorksheet(ws) - Expect.equal (wb.GetWorksheets().Length) 1 "length" + Expect.equal (wb.GetWorksheets().Count) 1 "length" Expect.equal (wb.GetWorksheets().[0].Name) "My Worksheet" "length" testCase "FsTable" <| fun _ -> let wb = new FsWorkbook() let ws = FsWorksheet.init ("My Worksheet") wb.AddWorksheet(ws) - Expect.equal (wb.GetWorksheets().Length) 1 "length" + Expect.equal (wb.GetWorksheets().Count) 1 "length" Expect.equal (wb.GetWorksheets().[0].Name) "My Worksheet" "length" - let table = FsTable("My Table",FsRangeAddress(FsAddress(1,1),FsAddress(5,5))) + let table = FsTable("MyTable",FsRangeAddress(FsAddress(1,1),FsAddress(5,5))) ws.AddTable(table) |> ignore Expect.equal (wb.GetTables().Length) 1 "table length" - Expect.equal (wb.GetTables().[0].Name) "My Table" "table name" + Expect.equal (wb.GetTables().[0].Name) "MyTable" "table name" ] From c2a889931acad4ab3e18e56bb8befb76c88cb612 Mon Sep 17 00:00:00 2001 From: Heinrich Lukas Weil Date: Tue, 1 Aug 2023 14:14:08 +0400 Subject: [PATCH 3/3] add spreadsheet component header parsing --- src/ISA/ISA.Spreadsheet/CompositeHeader.fs | 10 ++++++- src/ISA/ISA/Regex.fs | 16 +++++++++- .../ISA.Spreadsheet.Tests/ArcTableTests.fs | 5 +++- .../TestObjects/ArcTable.fs | 29 +++++++++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/ISA/ISA.Spreadsheet/CompositeHeader.fs b/src/ISA/ISA.Spreadsheet/CompositeHeader.fs index 4d442e63..41045d9b 100644 --- a/src/ISA/ISA.Spreadsheet/CompositeHeader.fs +++ b/src/ISA/ISA.Spreadsheet/CompositeHeader.fs @@ -49,6 +49,12 @@ module ActivePattern = Some r | _ -> None + let (|Component|_|) (cells : FsCell list) = + match cells with + | Term Regex.tryParseComponentColumnHeader CompositeHeader.Component r -> + Some r + | _ -> None + let (|Input|_|) (cells : FsCell list) = let cellValues = cells |> List.map (fun c -> c.Value) match cellValues with @@ -95,11 +101,13 @@ let fromFsCells (cells : list) : CompositeHeader = | Parameter p -> p | Factor f -> f | Characteristic c -> c + | Component c -> c | Input i -> i | Output o -> o | ProtocolHeader ph -> ph | FreeText ft -> ft - | _ -> raise (System.NotImplementedException("parseCompositeHeader")) + | _ -> failwithf "Could not parse header group %O" cells + let toFsCells (hasUnit : bool) (header : CompositeHeader) : list = if header.IsSingleColumn then diff --git a/src/ISA/ISA/Regex.fs b/src/ISA/ISA/Regex.fs index 3fe09429..109e4ea4 100644 --- a/src/ISA/ISA/Regex.fs +++ b/src/ISA/ISA/Regex.fs @@ -160,6 +160,15 @@ module ActivePatterns = | _ -> None | _ -> None + /// Matches a "Component [Term]" or "Component Value [Term]" column header and returns the Term string. + let (|ComponentColumnHeader|_|) input = + match input with + | TermColumn r -> + match r.TermColumnType with + | "Component" + | "Component Value" -> Some r.TermName + | _ -> None + | _ -> None /// Matches a short term string and returns the term source ref and the annotation number strings. /// @@ -211,7 +220,6 @@ module ActivePatterns = match r.Groups.["id"].Value with | TermAnnotation r -> Some r | _ -> Some {|LocalTAN = ""; TermAccessionNumber = ""; TermSourceREF = ""|} - | _ -> None | _ -> None /// Matches a "Input [InputType]" column header and returns the InputType as string. @@ -348,6 +356,12 @@ let tryParseCharacteristicColumnHeader input = | CharacteristicColumnHeader r -> Some r | _ -> None +/// Matches a "Component [Term]" or "Characteristics [Term]" or "Component Value [Term]" column header and returns the Term string. +let tryParseComponentColumnHeader input = + match input with + | ComponentColumnHeader r -> Some r + | _ -> None + /// Matches a "Term Source REF (ShortTerm)" column header and returns the ShortTerm as Term Source Ref and Annotation Number. /// /// Example: "Term Source REF (MS:1003022)" --> term source ref: "MS"; annotation number: "1003022" diff --git a/tests/ISA/ISA.Spreadsheet.Tests/ArcTableTests.fs b/tests/ISA/ISA.Spreadsheet.Tests/ArcTableTests.fs index 7f147bfe..386fe52f 100644 --- a/tests/ISA/ISA.Spreadsheet.Tests/ArcTableTests.fs +++ b/tests/ISA/ISA.Spreadsheet.Tests/ArcTableTests.fs @@ -52,6 +52,7 @@ let private simpleTable = [ Protocol.REF.appendLolColumn 1 Protocol.Type.appendCollectionColumn 1 + Protocol.Component.appendInstrumentColumn 1 Parameter.appendTemperatureColumn 1 Parameter.appendInstrumentColumn 1 Characteristic.appendOrganismColumn 1 @@ -65,13 +66,14 @@ let private simpleTable = let table = table.Value Expect.equal table.Name wsName "Name did not match" - Expect.equal table.ColumnCount 6 "Wrong number of columns" + Expect.equal table.ColumnCount 7 "Wrong number of columns" Expect.equal table.RowCount 1 "Wrong number of rows" let expectedHeaders = [ Protocol.REF.lolHeader Protocol.Type.collectionHeader + Protocol.Component.instrumentHeader Parameter.temperatureHeader Parameter.instrumentHeader Characteristic.organismHeader @@ -83,6 +85,7 @@ let private simpleTable = [ Protocol.REF.lolValue Protocol.Type.collectionValue + Protocol.Component.instrumentValue Parameter.temperatureValue Parameter.instrumentValue Characteristic.organismValue diff --git a/tests/ISA/ISA.Spreadsheet.Tests/TestObjects/ArcTable.fs b/tests/ISA/ISA.Spreadsheet.Tests/TestObjects/ArcTable.fs index 8ebb52f5..d3cf7103 100644 --- a/tests/ISA/ISA.Spreadsheet.Tests/TestObjects/ArcTable.fs +++ b/tests/ISA/ISA.Spreadsheet.Tests/TestObjects/ArcTable.fs @@ -256,6 +256,35 @@ module Protocol = t.Cell(FsAddress(i, colCount + 2),c).SetValueAs collectionValueV2 t.Cell(FsAddress(i, colCount + 3),c).SetValueAs collectionValueV3 + module Component = + + let instrumentHeader = + CompositeHeader.Component + (OntologyAnnotation.fromString("instrument model","MS","MS:1000031")) + let instrumentValue = + CompositeCell.createTermFromString + ("Thermo Fisher Scientific instrument model","MS","http://purl.obolibrary.org/obo/MS_1000483") + + + let instrumentHeaderV1 = "Component [instrument model]" + let instrumentHeaderV2 = "Term Source REF (MS:1000031)" + let instrumentHeaderV3 = "Term Accession Number (MS:1000031)" + + let instrumentValueV1 = "Thermo Fisher Scientific instrument model" + let instrumentValueV2 = "MS" + let instrumentValueV3 = "http://purl.obolibrary.org/obo/MS_1000483" + + let appendInstrumentColumn l (c : FsCellsCollection) (t : FsTable) = + let colCount = if t.IsEmpty(c) then 0 else t.ColumnCount() + t.Cell(FsAddress(1, colCount + 1),c).SetValueAs instrumentHeaderV1 + t.Cell(FsAddress(1, colCount + 2),c).SetValueAs instrumentHeaderV2 + t.Cell(FsAddress(1, colCount + 3),c).SetValueAs instrumentHeaderV3 + for i = 2 to l + 1 do + t.Cell(FsAddress(i, colCount + 1),c).SetValueAs instrumentValueV1 + t.Cell(FsAddress(i, colCount + 2),c).SetValueAs instrumentValueV2 + t.Cell(FsAddress(i, colCount + 3),c).SetValueAs instrumentValueV3 + + let initTable (appendOperations : (FsCellsCollection -> FsTable -> unit) list)= let c = FsCellsCollection() let t = FsTable(ArcTable.annotationTablePrefix, FsRangeAddress("A1:A1"))