From 775f9cf31ab034614afb4ee6cc333371cc5a42e9 Mon Sep 17 00:00:00 2001 From: Geoffrey Teale Date: Sun, 3 Nov 2024 14:58:14 +0100 Subject: [PATCH] Use sync pools for shared string parser --- file.go | 9 ++-- lib.go | 126 ++++++++++++++++++++++++++++++++++++++++++----- lib_test.go | 76 +++++++++++----------------- reftable.go | 58 ++++++++++++---------- reftable_test.go | 11 ++--- 5 files changed, 184 insertions(+), 96 deletions(-) diff --git a/file.go b/file.go index c57c7919..1d1e3625 100644 --- a/file.go +++ b/file.go @@ -335,7 +335,7 @@ func autoFilterDefinedName(sheet *Sheet, sheetIndex int) (*xlsxDefinedName, erro // representing the file in terms of the structure of an XLSX file. func (f *File) MakeStreamParts() (map[string]string, error) { var parts map[string]string - var refTable *RefTable = NewSharedStringRefTable(10000) // 10000 is arbitrary + var refTable *RefTable = NewSharedStringRefTable(DEFAULT_REFTABLE_SIZE) refTable.isWrite = true var workbookRels WorkBookRels = make(WorkBookRels) var err error @@ -465,7 +465,7 @@ func (f *File) MakeStreamParts() (map[string]string, error) { // MarshallParts constructs a map of file name to XML content representing the file // in terms of the structure of an XLSX file. func (f *File) MarshallParts(zipWriter *zip.Writer) error { - var refTable *RefTable = NewSharedStringRefTable(10000) // 10000 is arbitrary + var refTable *RefTable = NewSharedStringRefTable(DEFAULT_REFTABLE_SIZE) refTable.isWrite = true var workbookRels WorkBookRels = make(WorkBookRels) var err error @@ -650,9 +650,10 @@ func (f *File) MarshallParts(zipWriter *zip.Writer) error { // Here, value would be set to the raw value of the cell A1 in the // first sheet in the XLSX file. func (f *File) ToSlice() (output [][][]string, err error) { - output = [][][]string{} + sheetCount := len(f.Sheets) + output = make([][][]string, 0, sheetCount) for _, sheet := range f.Sheets { - s := [][]string{} + s := make([][]string, 0, sheet.MaxRow) err := sheet.ForEachRow(func(row *Row) error { r := []string{} err := row.ForEachCell(func(cell *Cell) error { diff --git a/lib.go b/lib.go index 3e71b9c4..6217f11e 100644 --- a/lib.go +++ b/lib.go @@ -14,6 +14,7 @@ import ( "runtime/debug" "strconv" "strings" + "sync" ) const ( @@ -22,6 +23,26 @@ const ( externalSheetBangChar = "!" ) +var ( + tokPool = sync.Pool{ + New: func() interface{} { + return &xml.StartElement{} + }, + } + + xlsxSIPool = sync.Pool{ + New: func() interface{} { + return &xlsxSI{} + }, + } + + xmlAttrPool = sync.Pool{ + New: func() interface{} { + return &xml.Attr{} + }, + } +) + // XLSXReaderError is the standard error type for otherwise undefined // errors in the XSLX reading process. type XLSXReaderError struct { @@ -845,15 +866,104 @@ func readSheetsFromZipFile(f *zip.File, file *File, sheetXMLMap map[string]strin return sheetsByName, sheets, err } +func readSharedStrings(rc io.Reader) (*RefTable, error) { + var err error + var decoder *xml.Decoder + var reftable *RefTable + var tok xml.Token + var count int + var countS string + var ok bool + var si *xlsxSI + var attr *xml.Attr + + wrap := func(err error) (*RefTable, error) { + return nil, fmt.Errorf("readSharedStrings: %w", err) + } + + decoder = xml.NewDecoder(rc) + + for { + tok = tokPool.Get().(xml.Token) + tok, err = decoder.Token() + if tok == nil { + break + } else if err == io.EOF { + break + } + if err != nil { + return wrap(err) + } + switch ty := tok.(type) { + case xml.StartElement: + switch ty.Name.Local { + case "sst": + attr = xmlAttrPool.Get().(*xml.Attr) + ok = false + for _, (*attr) = range ty.Attr { + if attr.Name.Local == "count" { + countS = attr.Value + ok = true + break + } + } + xmlAttrPool.Put(attr) + if !ok { + // No hints on the size, so we'll just start with + // a decent number of entries to avoid small + // allocs. + reftable = NewSharedStringRefTable(DEFAULT_REFTABLE_SIZE) + reftable.isWrite = false //Todo, do we actually use this? + } else { + count, err = strconv.Atoi(countS) + if err != nil { + return wrap(err) + } + reftable = NewSharedStringRefTable(count) + reftable.isWrite = false //Todo, do we actually use this? + } + case "si": + if reftable == nil { + return wrap(fmt.Errorf("si encountered before reftable created")) + } + si = xlsxSIPool.Get().(*xlsxSI) + if err = decoder.DecodeElement(si, &ty); err != nil { + xlsxSIPool.Put(si) + return wrap(err) + } + if len(si.R) > 0 { + reftable.AddRichText(xmlToRichText(si.R)) + } else { + reftable.AddString(si.T.getText()) + } + // clean up before returning to the pool, without + // these lines you'll see weird effects when reading + // another set of shared strings + si.R = nil + si.T = nil + xlsxSIPool.Put(si) + default: + // Do nothing + } + default: + // Do nothing + } + tokPool.Put(tok) + } + + if reftable == nil { + panic("Unitialised reftable") + } + return reftable, nil + +} + // readSharedStringsFromZipFile() is an internal helper function to // extract a reference table from the sharedStrings.xml file within // the XLSX zip file. func readSharedStringsFromZipFile(f *zip.File) (*RefTable, error) { - var sst *xlsxSST var err error var rc io.ReadCloser - var decoder *xml.Decoder - var reftable *RefTable wrap := func(err error) (*RefTable, error) { return nil, fmt.Errorf("readSharedStringsFromZipFile: %w", err) @@ -870,15 +980,7 @@ func readSharedStringsFromZipFile(f *zip.File) (*RefTable, error) { return wrap(err) } defer rc.Close() - - sst = new(xlsxSST) - decoder = xml.NewDecoder(rc) - err = decoder.Decode(sst) - if err != nil { - return wrap(err) - } - reftable = MakeSharedStringRefTable(sst) - return reftable, nil + return readSharedStrings(rc) } // readStylesFromZipFile() is an internal helper function to diff --git a/lib_test.go b/lib_test.go index 045554a2..ca40d951 100644 --- a/lib_test.go +++ b/lib_test.go @@ -284,6 +284,7 @@ func TestLib(t *testing.T) { // }) csRunC(c, "ReadRowsFromSheet", func(c *qt.C, constructor CellStoreConstructor) { + var err error var sharedstringsXML = bytes.NewBufferString(` @@ -337,14 +338,12 @@ func TestLib(t *testing.T) { footer="0.3"/> `) worksheet := new(xlsxWorksheet) - err := xml.NewDecoder(sheetxml).Decode(worksheet) - c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) + err = xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) sheet, err := NewSheet("test") c.Assert(err, qt.IsNil) lt := make(hyperlinkTable) @@ -433,12 +432,10 @@ func TestLib(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) @@ -486,13 +483,11 @@ func TestLib(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) lt := make(hyperlinkTable) @@ -568,13 +563,11 @@ func TestLib(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) lt := make(hyperlinkTable) @@ -717,12 +710,11 @@ func TestLib(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) + file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) lt := make(hyperlinkTable) @@ -764,13 +756,10 @@ func TestLib(t *testing.T) { err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) - file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) lt := make(hyperlinkTable) @@ -882,12 +871,11 @@ func TestLib(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) + sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) @@ -964,12 +952,11 @@ func TestLib(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) + sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) lt := make(hyperlinkTable) @@ -1043,12 +1030,11 @@ func TestLib(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) + sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) lt := make(hyperlinkTable) @@ -1334,13 +1320,10 @@ func TestLib(t *testing.T) { err := xml.NewDecoder(sheetXML).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) - file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) @@ -1433,12 +1416,11 @@ func TestReadRowsFromSheet(t *testing.T) { worksheet := new(xlsxWorksheet) err := xml.NewDecoder(sheetxml).Decode(worksheet) c.Assert(err, qt.IsNil) - sst := new(xlsxSST) - err = xml.NewDecoder(sharedstringsXML).Decode(sst) - c.Assert(err, qt.IsNil) file := new(File) file.cellStoreConstructor = constructor - file.referenceTable = MakeSharedStringRefTable(sst) + file.referenceTable, err = readSharedStrings(sharedstringsXML) + c.Assert(err, qt.IsNil) + worksheet.mapMergeCells() sheet, err := NewSheetWithCellStore("test", constructor) c.Assert(err, qt.IsNil) diff --git a/reftable.go b/reftable.go index 0195b3b2..ae90f4ee 100644 --- a/reftable.go +++ b/reftable.go @@ -1,5 +1,7 @@ package xlsx +const DEFAULT_REFTABLE_SIZE = 500 + type plainTextOrRichText struct { plainText string isRichText bool @@ -8,8 +10,7 @@ type plainTextOrRichText struct { type RefTable struct { indexedStrings []plainTextOrRichText - knownStrings map[string]int - knownRichTexts map[string][]int + knownStrings map[string][]int isWrite bool } @@ -17,28 +18,27 @@ type RefTable struct { func NewSharedStringRefTable(size int) *RefTable { rt := RefTable{} rt.indexedStrings = make([]plainTextOrRichText, 0, size) - rt.knownStrings = make(map[string]int, size) - rt.knownRichTexts = make(map[string][]int, size) + rt.knownStrings = make(map[string][]int, size) return &rt } -// MakeSharedStringRefTable takes an xlsxSST struct and converts -// it's contents to an slice of strings used to refer to string values -// by numeric index - this is the model used within XLSX worksheet (a -// numeric reference is stored to a shared cell value). -func MakeSharedStringRefTable(source *xlsxSST) *RefTable { - reftable := NewSharedStringRefTable(len(source.SI)) - reftable.isWrite = false - for _, si := range source.SI { - if len(si.R) > 0 { - richText := xmlToRichText(si.R) - reftable.AddRichText(richText) - } else { - reftable.AddString(si.T.getText()) - } - } - return reftable -} +// // MakeSharedStringRefTable takes an xlsxSST struct and converts +// // it's contents to an slice of strings used to refer to string values +// // by numeric index - this is the model used within XLSX worksheet (a +// // numeric reference is stored to a shared cell value). +// func MakeSharedStringRefTable(source *xlsxSST) *RefTable { +// reftable := NewSharedStringRefTable(len(source.SI)) +// reftable.isWrite = false +// for _, si := range source.SI { +// if len(si.R) > 0 { +// richText := xmlToRichText(si.R) +// reftable.AddRichText(richText) +// } else { +// reftable.AddString(si.T.getText()) +// } +// } +// return reftable +// } // makeXlsxSST takes a RefTable and returns and // equivalent xlsxSST representation. @@ -77,15 +77,19 @@ func (rt *RefTable) ResolveSharedString(index int) (plainText string, richText [ // the existing index. func (rt *RefTable) AddString(str string) int { if rt.isWrite { - index, ok := rt.knownStrings[str] + indices, ok := rt.knownStrings[str] if ok { - return index + for _, index := range indices { + if !rt.indexedStrings[index].isRichText { + return index + } + } } } ptrt := plainTextOrRichText{plainText: str, isRichText: false} rt.indexedStrings = append(rt.indexedStrings, ptrt) index := len(rt.indexedStrings) - 1 - rt.knownStrings[str] = index + rt.knownStrings[str] = append(rt.knownStrings[str], index) return index } @@ -95,10 +99,10 @@ func (rt *RefTable) AddString(str string) int { func (rt *RefTable) AddRichText(r []RichTextRun) int { plain := richTextToPlainText(r) if rt.isWrite { - indices, ok := rt.knownRichTexts[plain] + indices, ok := rt.knownStrings[plain] if ok { for _, index := range indices { - if areRichTextsEqual(rt.indexedStrings[index].richText, r) { + if rt.indexedStrings[index].isRichText && areRichTextsEqual(rt.indexedStrings[index].richText, r) { return index } } @@ -108,7 +112,7 @@ func (rt *RefTable) AddRichText(r []RichTextRun) int { ptrt.richText = append(ptrt.richText, r...) rt.indexedStrings = append(rt.indexedStrings, ptrt) index := len(rt.indexedStrings) - 1 - rt.knownRichTexts[plain] = append(rt.knownRichTexts[plain], index) + rt.knownStrings[plain] = append(rt.knownStrings[plain], index) return index } diff --git a/reftable_test.go b/reftable_test.go index 9e8d537f..f15ca417 100644 --- a/reftable_test.go +++ b/reftable_test.go @@ -75,11 +75,11 @@ func TestCreateNewSharedStringRefTable(t *testing.T) { // using xlsx.MakeSharedStringRefTable(). func TestMakeSharedStringRefTable(t *testing.T) { c := qt.New(t) - sst := new(xlsxSST) sharedStringsXML := bytes.NewBufferString(reftabletest_sharedStringsXMLStr) - err := xml.NewDecoder(sharedStringsXML).Decode(sst) + + reftable, err := readSharedStrings(sharedStringsXML) c.Assert(err, qt.IsNil) - reftable := MakeSharedStringRefTable(sst) + c.Assert(reftable.Length(), qt.Equals, 5) p, r := reftable.ResolveSharedString(0) c.Assert(p, qt.Equals, "Foo") @@ -106,11 +106,10 @@ func TestMakeSharedStringRefTable(t *testing.T) { // table to a string value using RefTable.ResolveSharedString(). func TestResolveSharedString(t *testing.T) { c := qt.New(t) - sst := new(xlsxSST) sharedStringsXML := bytes.NewBufferString(reftabletest_sharedStringsXMLStr) - err := xml.NewDecoder(sharedStringsXML).Decode(sst) + reftable, err := readSharedStrings(sharedStringsXML) c.Assert(err, qt.IsNil) - reftable := MakeSharedStringRefTable(sst) + p, r := reftable.ResolveSharedString(0) c.Assert(p, qt.Equals, "Foo") c.Assert(r, qt.IsNil)