Skip to content

Commit

Permalink
better, if probably fragile, detection of Turkish text encoding ISO-885…
Browse files Browse the repository at this point in the history
…9-9 v1.8.5
  • Loading branch information
raymondben committed Nov 30, 2024
1 parent bfb783e commit aa112ee
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 2 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: datavolley
Title: Reading and Analyzing DataVolley Scout Files
Version: 1.8.4
Version: 1.8.5
Authors@R: c(person("Ben", "Raymond", email = "[email protected]", role = c("aut", "cre")),
person("Adrien", "Ickowicz", role = "aut"),
person("Tyler", "Widdison", role = "aut"),
Expand Down
2 changes: 1 addition & 1 deletion R/read_dv.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ dv_read <- function(filename, insert_technical_timeouts=TRUE, do_warn=FALSE, do_
##if (any(xiso_idx))
## encoding <- c(encoding,gsub("^x\\-iso","iso",encoding[xiso_idx]))
## add common ones
encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251", "iso-8859-9")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
encoding <- encoding[tolower(encoding) %in% tolower(iconvlist())]
##if (length(encoding)<=1) encoding <- iconvlist()
}
Expand Down
3 changes: 3 additions & 0 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,9 @@ get_best_encodings <- function(encodings_to_test, filename, read_from = 10, read
badwords <- c(badwords, tolower(c("\uc9\u57\uc9", "\ue2\u2122"))) ## cp932 wrongly detected as macintosh
badwords <- c(badwords, c("\ufd\u79")) ## windows-1254 wrongly detected as 1250
badwords <- c(badwords, c("\u6e\u434\u45a\u69\u434\u45a", "\u76\u69\u434\u45a")) ## "ncic" and "vic" but c with caron (Serbian/Czech/etc) wrongly detected as cyrillic
badwords <- c(badwords, c("\u79\u69\uf0\u69\u74", "\u75\u6c\u61\ufe", "\u64\u6f\uf0\u61\u6e", "\u70\u6f\uf0\u6c\u75", "\u6e\ufd\u6c\ufc\u66\u65\u72", ## 1252
"\u64\u6f\u111\u61\u6e", "\u75\u6c\u61\u163", ## 1250
"\u62\u44c\u6c", "\u44c\u6d\u69\u74", "\u62\u65\u6c\u65\u64\u44d\u79", "\u6e\u44d\u6c\u44c\u66\u65\u72")) ## 1251 - Turkish iso-8859-9 wrongly detected as windows-1252 or 1250 or 1251. Need to be a little careful because some characters (e.g. \uf0, \ufe) are valid in e.g. Icelandic and Faroese
badwords_trans <- c("oooo", "ouuoo", "oouoo", "uuou", "uuoo") ## badwords after transliteration, e.g. wrongly-detected cyrillic
## get the \uxx numbers from sprintf("%x",utf8ToInt(tolower(dodgy_string_or_char))) or paste0("\\u", sprintf("%x", utf8ToInt(tolower("dodgy"))), collapse = "")
read_with_enc <- function(filename, enc_to_test) {
Expand Down

0 comments on commit aa112ee

Please sign in to comment.