Skip to content

Commit

Permalink
Revert "unicode: Don't special case ignorable code points"
Browse files Browse the repository at this point in the history
This reverts commit 5c26d2f.

It turns out that we can't do this, because while the old behavior of
ignoring ignorable code points was most definitely wrong, we have
case-folding filesystems with on-disk hash values with that wrong
behavior.

So now you can't look up those names, because they hash to something
different.

Of course, it's also entirely possible that in the meantime people have
created *new* files with the new ("more correct") case folding logic,
and reverting will just make other things break.

The correct solution is to not do case folding in filesystems, but
sadly, people seem to never really understand that.  People still see it
as a feature, not a bug.

Reported-by: Qi Han <[email protected]>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219586
Cc: Gabriel Krisman Bertazi <[email protected]>
Requested-by: Jaegeuk Kim <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
torvalds committed Dec 11, 2024
1 parent ec8e2d3 commit 231825b
Show file tree
Hide file tree
Showing 2 changed files with 3,427 additions and 3,346 deletions.
70 changes: 70 additions & 0 deletions fs/unicode/mkutf8data.c
Original file line number Diff line number Diff line change
Expand Up @@ -2230,6 +2230,75 @@ static void nfdicf_init(void)
file_fail(fold_name);
}

static void ignore_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int first;
unsigned int last;
unsigned int *um;
int count;
int ret;

if (verbose > 0)
printf("Parsing %s\n", prop_name);
file = fopen(prop_name, "r");
if (!file)
open_fail(prop_name, errno);
assert(file);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
if (ret == 3) {
if (strcmp(buf0, "Default_Ignorable_Code_Point"))
continue;
if (!utf32valid(first) || !utf32valid(last))
line_fail(prop_name, line);
for (unichar = first; unichar <= last; unichar++) {
free(unicode_data[unichar].utf32nfdi);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
count++;
}
if (verbose > 1)
printf(" %X..%X Default_Ignorable_Code_Point\n",
first, last);
continue;
}
ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
if (ret == 2) {
if (strcmp(buf0, "Default_Ignorable_Code_Point"))
continue;
if (!utf32valid(unichar))
line_fail(prop_name, line);
free(unicode_data[unichar].utf32nfdi);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
if (verbose > 1)
printf(" %X Default_Ignorable_Code_Point\n",
unichar);
count++;
continue;
}
}
fclose(file);

if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(prop_name);
}

static void corrections_init(void)
{
FILE *file;
Expand Down Expand Up @@ -3342,6 +3411,7 @@ int main(int argc, char *argv[])
ccc_init();
nfdi_init();
nfdicf_init();
ignore_init();
corrections_init();
hangul_decompose();
nfdi_decompose();
Expand Down
Loading

0 comments on commit 231825b

Please sign in to comment.