Skip to content

Commit

Permalink
Optimized unescape
Browse files Browse the repository at this point in the history
  • Loading branch information
Chlumsky committed Apr 23, 2023
1 parent 4b2746e commit c8964ee
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 136 deletions.
82 changes: 38 additions & 44 deletions generated/ConfigurationParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,47 +110,6 @@ void ConfigurationParser::readHexQuad(int &value) {
cur += 4;
}

// Decodes one backslash escape sequence at the read cursor and stores the
// result in `codepoints` as a NUL-terminated UTF-8 byte sequence.
//
// The character at `cur` is skipped without inspection (parseStdString calls
// this with `cur` on the backslash) and the character after it selects the
// escape:
//   b, f, n, r, t (either case)  -> the matching control character
//   u / U                        -> readHexQuad() supplies a UTF-16 code unit;
//                                   a high surrogate must be followed by a
//                                   second "\u" escape holding a low surrogate
//   anything else                -> that character itself
//
// Throws Error::UNEXPECTED_END_OF_FILE if a NUL follows the skipped character
// (cur is left pointing at the NUL) and Error::UTF16_ENCODING_ERROR for a high
// surrogate that is not completed by a low surrogate.
//
// NOTE: a "\u0000" escape stores a NUL as the first byte, so the output reads
// as an empty C string. A low surrogate that is not preceded by a high
// surrogate is not rejected; it is encoded like any other 16-bit value.
void ConfigurationParser::unescape(char *codepoints) {
    ++cur; // step over the character that introduced the escape
    switch (*cur++) {
        case '\0':
            // Back up so the cursor stays on the terminator.
            --cur;
            throw Error::UNEXPECTED_END_OF_FILE;
        case 'B': case 'b':
            *codepoints = '\b';
            break;
        case 'F': case 'f':
            *codepoints = '\f';
            break;
        case 'N': case 'n':
            *codepoints = '\n';
            break;
        case 'R': case 'r':
            *codepoints = '\r';
            break;
        case 'T': case 't':
            *codepoints = '\t';
            break;
        case 'U': case 'u': {
            int unit;
            readHexQuad(unit);
            unsigned long scalar;
            if ((unit&0xfc00) != 0xd800) {
                // Not a high surrogate: the code unit is used as the code point.
                scalar = (unsigned long) unit;
            } else {
                // High surrogate: the low half must follow as another \u escape.
                if (cur[0] != '\\' || (cur[1] != 'u' && cur[1] != 'U'))
                    throw Error::UTF16_ENCODING_ERROR;
                scalar = (unsigned long) ((unit&0x03ff)<<10);
                cur += 2;
                readHexQuad(unit);
                if ((unit&0xfc00) != 0xdc00)
                    throw Error::UTF16_ENCODING_ERROR;
                scalar = 0x010000+(scalar|(unsigned long) (unit&0x03ff));
            }
            if (!(scalar&0xffffff80)) {
                // Fits in seven bits: stored as a single byte.
                *codepoints = (char) scalar;
            } else {
                // Each additional byte extends the representable range by five bits.
                int byteCount = 1;
                while (scalar>>(5*byteCount+1) && byteCount < 6)
                    ++byteCount;
                // Lead byte: byteCount high bits set (after truncation to char),
                // followed by the topmost payload bits.
                *codepoints = (char) (0xff<<(8-byteCount)|scalar>>6*(byteCount-1));
                // Continuation bytes, most significant six-bit group first.
                // codepoints is advanced so the terminator below lands right
                // after the last byte written.
                for (int remaining = byteCount-1; remaining > 0; --remaining)
                    *++codepoints = (char) (0x80|(scalar>>6*(remaining-1)&0x3f));
            }
            break;
        }
        default:
            // Unrecognized escape: keep the escaped character verbatim.
            *codepoints = cur[-1];
    }
    codepoints[1] = '\0';
}

bool ConfigurationParser::isAlphanumeric(char c) {
switch (c) {
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I':
Expand Down Expand Up @@ -204,9 +163,44 @@ void ConfigurationParser::parseStdString(std::string &value) {
value.clear();
while (*cur != '"') {
if (*cur == '\\') {
char utfBuffer[8];
unescape(utfBuffer);
value += utfBuffer;
++cur;
switch (*cur++) {
case '\0':
--cur;
throw Error::UNEXPECTED_END_OF_FILE;
case 'B': case 'b': value.push_back('\b'); break;
case 'F': case 'f': value.push_back('\f'); break;
case 'N': case 'n': value.push_back('\n'); break;
case 'R': case 'r': value.push_back('\r'); break;
case 'T': case 't': value.push_back('\t'); break;
case 'U': case 'u': {
unsigned long cp;
int wc;
readHexQuad(wc);
if ((wc&0xfc00) == 0xd800) {
if (!(cur[0] == '\\' && (cur[1] == 'u' || cur[1] == 'U')))
throw Error::UTF16_ENCODING_ERROR;
cp = (unsigned long) ((wc&0x03ff)<<10);
cur += 2;
readHexQuad(wc);
if ((wc&0xfc00) != 0xdc00)
throw Error::UTF16_ENCODING_ERROR;
cp = 0x010000+(cp|(unsigned long) (wc&0x03ff));
} else
cp = (unsigned long) wc;
if (cp&0xffffff80) {
int len;
for (len = 1; cp>>(5*len+1) && len < 6; ++len);
value.push_back((char) (0xff<<(8-len)|cp>>6*(len-1)));
for (int i = 1; i < len; ++i)
value.push_back((char) (0x80|(cp>>6*(len-i-1)&0x3f)));
} else
value.push_back((char) cp);
break;
}
default:
value.push_back(cur[-1]);
}
continue;
}
if (!*cur)
Expand Down
85 changes: 0 additions & 85 deletions src/ParserGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,50 +75,6 @@ bool $::readHexQuad(int &value) {
(cur += 4, true)
);
}
$::Error::Type $::unescape(char *codepoints) {
switch (++cur, *cur++) {
case '\0':
--cur;
return Error::UNEXPECTED_END_OF_FILE;
case 'B': case 'b': codepoints[0] = '\b'; break;
case 'F': case 'f': codepoints[0] = '\f'; break;
case 'N': case 'n': codepoints[0] = '\n'; break;
case 'R': case 'r': codepoints[0] = '\r'; break;
case 'T': case 't': codepoints[0] = '\t'; break;
case 'U': case 'u': {
unsigned long cp;
int wc;
if (!readHexQuad(wc))
return Error::JSON_SYNTAX_ERROR;
if ((wc&0xfc00) == 0xd800) {
if (!(cur[0] == '\\' && (cur[1] == 'u' || cur[1] == 'U')))
return Error::UTF16_ENCODING_ERROR;
cp = (unsigned long) ((wc&0x03ff)<<10);
cur += 2;
if (!readHexQuad(wc))
return Error::JSON_SYNTAX_ERROR;
if ((wc&0xfc00) != 0xdc00)
return Error::UTF16_ENCODING_ERROR;
cp = 0x010000+(cp|(unsigned long) (wc&0x03ff));
} else
cp = (unsigned long) wc;
if (cp&0xffffff80) {
int len;
for (len = 1; cp>>(5*len+1) && len < 6; ++len);
codepoints[0] = (char) (0xff<<(8-len)|cp>>6*(len-1));
for (int i = 1; i < len; ++i)
*++codepoints = (char) (0x80|(cp>>6*(len-i-1)&0x3f));
} else
codepoints[0] = (char) cp;
break;
}
default:
codepoints[0] = cur[-1];
}
codepoints[1] = '\0';
return Error::OK;
}
)";

static constexpr const char *const COMMON_FUNCTION_IMPL_THROW =
Expand Down Expand Up @@ -189,47 +145,6 @@ void $::readHexQuad(int &value) {
throw Error::JSON_SYNTAX_ERROR;
cur += 4;
}
void $::unescape(char *codepoints) {
switch (++cur, *cur++) {
case '\0':
--cur;
throw Error::UNEXPECTED_END_OF_FILE;
case 'B': case 'b': codepoints[0] = '\b'; break;
case 'F': case 'f': codepoints[0] = '\f'; break;
case 'N': case 'n': codepoints[0] = '\n'; break;
case 'R': case 'r': codepoints[0] = '\r'; break;
case 'T': case 't': codepoints[0] = '\t'; break;
case 'U': case 'u': {
unsigned long cp;
int wc;
readHexQuad(wc);
if ((wc&0xfc00) == 0xd800) {
if (!(cur[0] == '\\' && (cur[1] == 'u' || cur[1] == 'U')))
throw Error::UTF16_ENCODING_ERROR;
cp = (unsigned long) ((wc&0x03ff)<<10);
cur += 2;
readHexQuad(wc);
if ((wc&0xfc00) != 0xdc00)
throw Error::UTF16_ENCODING_ERROR;
cp = 0x010000+(cp|(unsigned long) (wc&0x03ff));
} else
cp = (unsigned long) wc;
if (cp&0xffffff80) {
int len;
for (len = 1; cp>>(5*len+1) && len < 6; ++len);
codepoints[0] = (char) (0xff<<(8-len)|cp>>6*(len-1));
for (int i = 1; i < len; ++i)
*++codepoints = (char) (0x80|(cp>>6*(len-i-1)&0x3f));
} else
codepoints[0] = (char) cp;
break;
}
default:
codepoints[0] = cur[-1];
}
codepoints[1] = '\0';
}
)";

std::string ParserGenerator::generateMatchKeyword(const char *keyword) {
Expand Down
59 changes: 52 additions & 7 deletions src/types/StringType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,65 @@ StringType::StringType(std::string &&name, const StringAPI &api) : Type(TypeName
this->api.appendStringLiteral = this->api.appendCStr;
}

// Produces the source text of an inline escape-sequence decoder for the
// generated parser.
//
// The emitted code skips the character at `cur`, switches on the next one and
// appends the decoded character(s) to the string named by `outputName`, using
// the statement text returned by generateAppendChar(). "\u" escapes are read
// with readHexQuad(), UTF-16 surrogate pairs are combined, and values that do
// not fit in seven bits are appended as a multi-byte sequence.
//
// generator  - supplies the error statements (generateErrorStatement) and the
//              noThrow setting; with noThrow the emitted code checks the
//              result of readHexQuad() instead of calling it as a statement.
// outputName - name of the output string in the emitted code.
// indent     - prefix placed before every emitted line; nested lines get
//              additional INDENT units.
//
// Returns the emitted code, one '\n'-terminated line per statement.
std::string StringType::generateUnescapeBody(ParserGenerator *generator, const char *outputName, const std::string &indent) const {
    // Line prefixes for each nesting depth of the emitted code.
    const std::string depth1 = indent+INDENT;
    const std::string depth2 = depth1+INDENT;
    const std::string depth3 = depth2+INDENT;
    const std::string depth4 = depth3+INDENT;

    // Case labels of the single-character escapes, paired with the character
    // literal each one appends.
    static const char *const SIMPLE_ESCAPES[][2] = {
        { "case 'B': case 'b': ", "'\\b'" },
        { "case 'F': case 'f': ", "'\\f'" },
        { "case 'N': case 'n': ", "'\\n'" },
        { "case 'R': case 'r': ", "'\\r'" },
        { "case 'T': case 't': ", "'\\t'" }
    };

    std::string code;

    // Emits the readHexQuad(wc) call at the given prefix, with a result check
    // when the generator is configured not to throw.
    const auto emitReadHexQuad = [&](const std::string &pad) {
        if (generator->settings().noThrow) {
            code += pad+"if (!readHexQuad(wc))\n";
            code += pad+INDENT+generator->generateErrorStatement(ParserGenerator::Error::JSON_SYNTAX_ERROR)+";\n";
        } else {
            code += pad+"readHexQuad(wc);\n";
        }
    };

    // Switch header and the end-of-input case.
    code += indent+"++cur;\n";
    code += indent+"switch (*cur++) {\n";
    code += depth1+"case '\\0':\n";
    code += depth2+"--cur;\n";
    code += depth2+generator->generateErrorStatement(ParserGenerator::Error::UNEXPECTED_END_OF_FILE)+";\n";

    // Single-character escapes.
    for (const auto &escape : SIMPLE_ESCAPES)
        code += depth1+escape[0]+generateAppendChar(outputName, escape[1])+"; break;\n";

    // \u escape: read the code unit and combine surrogate pairs.
    code += depth1+"case 'U': case 'u': {\n";
    code += depth2+"unsigned long cp;\n";
    code += depth2+"int wc;\n";
    emitReadHexQuad(depth2);
    code += depth2+"if ((wc&0xfc00) == 0xd800) {\n";
    code += depth3+"if (!(cur[0] == '\\\\' && (cur[1] == 'u' || cur[1] == 'U')))\n";
    code += depth4+generator->generateErrorStatement(ParserGenerator::Error::UTF16_ENCODING_ERROR)+";\n";
    code += depth3+"cp = (unsigned long) ((wc&0x03ff)<<10);\n";
    code += depth3+"cur += 2;\n";
    emitReadHexQuad(depth3);
    code += depth3+"if ((wc&0xfc00) != 0xdc00)\n";
    code += depth4+generator->generateErrorStatement(ParserGenerator::Error::UTF16_ENCODING_ERROR)+";\n";
    code += depth3+"cp = 0x010000+(cp|(unsigned long) (wc&0x03ff));\n";
    code += depth2+"} else\n";
    code += depth3+"cp = (unsigned long) wc;\n";

    // \u escape: append the code point, multi-byte when above seven bits.
    code += depth2+"if (cp&0xffffff80) {\n";
    code += depth3+"int len;\n";
    code += depth3+"for (len = 1; cp>>(5*len+1) && len < 6; ++len);\n";
    code += depth3+generateAppendChar(outputName, "(char) (0xff<<(8-len)|cp>>6*(len-1))")+";\n";
    code += depth3+"for (int i = 1; i < len; ++i)\n";
    code += depth4+generateAppendChar(outputName, "(char) (0x80|(cp>>6*(len-i-1)&0x3f))")+";\n";
    code += depth2+"} else\n";
    code += depth3+generateAppendChar(outputName, "(char) cp")+";\n";
    code += depth2+"break;\n";
    code += depth1+"}\n";

    // Any other escaped character is appended unchanged.
    code += depth1+"default:\n";
    code += depth2+generateAppendChar(outputName, "cur[-1]")+";\n";
    code += indent+"}\n";
    return code;
}

std::string StringType::generateParserFunctionBody(ParserGenerator *generator, const std::string &indent) const {
std::string body;
body += indent+"if (!matchSymbol('\"'))\n";
body += indent+INDENT+generator->generateErrorStatement(ParserGenerator::Error::STRING_EXPECTED)+";\n";
body += indent+generateClear("value")+";\n";
body += indent+"while (*cur != '\"') {\n";
body += indent+INDENT "if (*cur == '\\\\') {\n";
body += indent+INDENT INDENT "char utfBuffer[8];\n";
if (generator->settings().noThrow) {
body += indent+INDENT INDENT "if (Error error = unescape(utfBuffer))\n";
body += indent+INDENT INDENT INDENT "return error;\n";
} else
body += indent+INDENT INDENT "unescape(utfBuffer);\n";
body += indent+INDENT INDENT+generateAppendCStr("value", "utfBuffer")+";\n";
body += generateUnescapeBody(generator, "value", indent+INDENT INDENT);
body += indent+INDENT INDENT "continue;\n";
body += indent+INDENT "}\n";
body += indent+INDENT "if (!*cur)\n";
Expand Down
2 changes: 2 additions & 0 deletions src/types/StringType.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@ class StringType : public Type {
private:
StringAPI api;

std::string generateUnescapeBody(ParserGenerator *generator, const char *outputName, const std::string &indent) const;

};

0 comments on commit c8964ee

Please sign in to comment.