Share control-character escaping between the XlsxWriter XML writers.

Removes the duplicated _xHHHH_ escaping code (and its compiled regexes) from
sharedstrings.py and worksheet.py, adds it once as the static method
XMLwriter._escape_control_characters() in xmlwriter.py, and also applies it
in XMLwriter._xml_data_element(). The XML escape regex becomes the
module-level `xml_escapes` instead of a per-instance attribute. Adds the
comparison test test_escapes09.py together with its escapes09.xlsx fixture.

NOTE(review): this patch text was rebuilt from a copy whose line breaks,
indentation and angle-bracket markup had been stripped. Indentation follows
the Python structure and every hunk matches its @@ line counts, but the
column-alignment padding inside the `\bANCHORARRAY\( |` context line could
not be recovered -- confirm against worksheet.py, or apply with
`git apply --ignore-whitespace` if that context line fails to match.

diff --git a/xlsxwriter/sharedstrings.py b/xlsxwriter/sharedstrings.py
index fc881efb2..167d1683b 100644
--- a/xlsxwriter/sharedstrings.py
+++ b/xlsxwriter/sharedstrings.py
@@ -6,17 +6,10 @@
 # Copyright 2013-2023, John McNamara, jmcnamara@cpan.org
 #
 
-# Standard packages.
-import re
-
 # Package imports.
 from . import xmlwriter
 from .utility import preserve_whitespace
 
-# Compile performance critical regular expressions.
-re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
-re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")
-
 
 class SharedStrings(xmlwriter.XMLwriter):
     """
@@ -92,22 +85,8 @@ def _write_si(self, string):
         # Write the <si> element.
         attributes = []
 
-        # Excel escapes control characters with _xHHHH_ and also escapes any
-        # literal strings of that type by encoding the leading underscore.
-        # So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
-        # The following substitutions deal with those cases.
-
-        # Escape the escape.
-        string = re_control_chars_1.sub(r"_x005F\1", string)
-
-        # Convert control character to the _xHHHH_ escape.
-        string = re_control_chars_2.sub(
-            lambda match: "_x%04X_" % ord(match.group(1)), string
-        )
-
-        # Escapes non characters in strings.
-        string = string.replace("\uFFFE", "_xFFFE_")
-        string = string.replace("\uFFFF", "_xFFFF_")
+        # Convert control character to a _xHHHH_ escape.
+        string = self._escape_control_characters(string)
 
         # Add attribute to preserve leading or trailing whitespace.
         if preserve_whitespace(string):
diff --git a/xlsxwriter/test/comparison/test_escapes09.py b/xlsxwriter/test/comparison/test_escapes09.py
new file mode 100644
index 000000000..633ae3ee3
--- /dev/null
+++ b/xlsxwriter/test/comparison/test_escapes09.py
@@ -0,0 +1,50 @@
+###############################################################################
+#
+# Tests for XlsxWriter.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+# Copyright (c), 2013-2023, John McNamara, jmcnamara@cpan.org
+#
+
+from ..excel_comparison_test import ExcelComparisonTest
+from ...workbook import Workbook
+
+
+class TestCompareXLSXFiles(ExcelComparisonTest):
+    """
+    Test file created by XlsxWriter against a file created by Excel.
+
+    """
+
+    def setUp(self):
+        self.set_filename("escapes09.xlsx")
+
+    def test_create_file(self):
+        """Test the creation of a simple XlsxWriter file."""
+
+        workbook = Workbook(self.got_filename)
+
+        worksheet = workbook.add_worksheet()
+        chart = workbook.add_chart({"type": "line"})
+
+        chart.axis_ids = [52721920, 53133312]
+
+        worksheet.write(0, 0, "Data\x1b[32m1")
+        worksheet.write(1, 0, "Data\x1b[32m2")
+        worksheet.write(2, 0, "Data\x1b[32m3")
+        worksheet.write(3, 0, "Data\x1b[32m4")
+
+        worksheet.write(0, 1, 10)
+        worksheet.write(1, 1, 20)
+        worksheet.write(2, 1, 10)
+        worksheet.write(3, 1, 30)
+
+        chart.add_series(
+            {"categories": "=Sheet1!$A$1:$A$4", "values": "=Sheet1!$B$1:$B$4"}
+        )
+
+        worksheet.insert_chart("E9", chart)
+
+        workbook.close()
+
+        self.assertExcelEqual()
diff --git a/xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx b/xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx
new file mode 100644
index 000000000..c88955780
Binary files /dev/null and b/xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx differ
diff --git a/xlsxwriter/worksheet.py b/xlsxwriter/worksheet.py
index e5a17022b..31453c77b 100644
--- a/xlsxwriter/worksheet.py
+++ b/xlsxwriter/worksheet.py
@@ -44,10 +44,6 @@
 from .exceptions import DuplicateTableName
 from .exceptions import OverlappingRange
 
-# Compile performance critical regular expressions.
-re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
-re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")
-
 re_dynamic_function = re.compile(
     r"""
     \bANCHORARRAY\( |
@@ -6781,15 +6777,8 @@ def _write_cell(self, row, col, cell):
             else:
                 # Write an optimized in-line string.
 
-                # Escape control characters. See SharedString.pm for details.
-                string = re_control_chars_1.sub(r"_x005F\1", string)
-                string = re_control_chars_2.sub(
-                    lambda match: "_x%04X_" % ord(match.group(1)), string
-                )
-
-                # Escapes non characters in strings.
-                string = string.replace("\uFFFE", "_xFFFE_")
-                string = string.replace("\uFFFF", "_xFFFF_")
+                # Convert control character to a _xHHHH_ escape.
+                string = self._escape_control_characters(string)
 
                 # Write any rich strings without further tags.
                 if string.startswith("<r>") and string.endswith("</r>"):
diff --git a/xlsxwriter/xmlwriter.py b/xlsxwriter/xmlwriter.py
index 28b8a0f9c..d0c480428 100644
--- a/xlsxwriter/xmlwriter.py
+++ b/xlsxwriter/xmlwriter.py
@@ -12,6 +12,11 @@
 import re
 from io import StringIO
 
+# Compile performance critical regular expressions.
+re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
+re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")
+xml_escapes = re.compile('["&<>\n]')
+
 
 class XMLwriter(object):
     """
@@ -21,7 +26,6 @@ class XMLwriter(object):
 
     def __init__(self):
         self.fh = None
-        self.escapes = re.compile('["&<>\n]')
         self.internal_fh = False
 
     def _set_filehandle(self, filehandle):
@@ -94,6 +98,8 @@ def _xml_data_element(self, tag, data, attributes=[]):
             tag += ' %s="%s"' % (key, value)
 
         data = self._escape_data(data)
+        data = self._escape_control_characters(data)
+
         self.fh.write("<%s>%s</%s>" % (tag, data, end_tag))
 
     def _xml_string_element(self, index, attributes=[]):
@@ -178,7 +184,7 @@ def _xml_rich_inline_string(self, string, attributes=[]):
     def _escape_attributes(self, attribute):
         # Escape XML characters in attributes.
         try:
-            if not self.escapes.search(attribute):
+            if not xml_escapes.search(attribute):
                 return attribute
         except TypeError:
             return attribute
@@ -197,10 +203,32 @@ def _escape_data(self, data):
         # is different from _escape_attributes() in that double quotes
         # are not escaped by Excel.
         try:
-            if not self.escapes.search(data):
+            if not xml_escapes.search(data):
                 return data
         except TypeError:
             return data
 
         data = data.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
         return data
+
+    @staticmethod
+    def _escape_control_characters(data):
+        # Excel escapes control characters with _xHHHH_ and also escapes any
+        # literal strings of that type by encoding the leading underscore.
+        # So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
+        # The following substitutions deal with those cases.
+        try:
+            # Escape the escape.
+            data = re_control_chars_1.sub(r"_x005F\1", data)
+        except TypeError:
+            return data
+
+        # Convert control character to the _xHHHH_ escape.
+        data = re_control_chars_2.sub(
+            lambda match: "_x%04X_" % ord(match.group(1)), data
+        )
+
+        # Escapes non characters in strings.
+        data = data.replace("\uFFFE", "_xFFFE_").replace("\uFFFF", "_xFFFF_")
+
+        return data