Better, if probably fragile, detection of Turkish text encoding ISO-8859-9 (v1.8.5)
openvolley · Nov 30, 2024 · aa112ee · aa112ee
1 parent bfb783e
commit aa112ee
Show file tree

Hide file tree

Showing 3 changed files with 5 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: datavolley
 Title: Reading and Analyzing DataVolley Scout Files
-Version: 1.8.4
+Version: 1.8.5
 Authors@R: c(person("Ben", "Raymond", email = "[email protected]", role = c("aut", "cre")),
              person("Adrien", "Ickowicz", role = "aut"),
              person("Tyler", "Widdison", role = "aut"),

diff --git a/R/read_dv.R b/R/read_dv.R
@@ -143,7 +143,7 @@ dv_read <- function(filename, insert_technical_timeouts=TRUE, do_warn=FALSE, do_
             ##if (any(xiso_idx))
             ##    encoding <- c(encoding,gsub("^x\\-iso","iso",encoding[xiso_idx]))
             ## add common ones
-            encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
+            encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251", "iso-8859-9")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
             encoding <- encoding[tolower(encoding) %in% tolower(iconvlist())]
             ##if (length(encoding)<=1) encoding <- iconvlist()
         }

diff --git a/R/util.R b/R/util.R
@@ -372,6 +372,9 @@ get_best_encodings <- function(encodings_to_test, filename, read_from = 10, read
     badwords <- c(badwords, tolower(c("\uc9\u57\uc9", "\ue2\u2122"))) ## cp932 wrongly detected as macintosh
     badwords <- c(badwords, c("\ufd\u79")) ## windows-1254 wrongly detected as 1250
     badwords <- c(badwords, c("\u6e\u434\u45a\u69\u434\u45a", "\u76\u69\u434\u45a")) ## "ncic" and "vic" but c with caron (Serbian/Czech/etc) wrongly detected as cyrillic
+    badwords <- c(badwords, c("\u79\u69\uf0\u69\u74", "\u75\u6c\u61\ufe", "\u64\u6f\uf0\u61\u6e", "\u70\u6f\uf0\u6c\u75", "\u6e\ufd\u6c\ufc\u66\u65\u72", ## 1252
+                              "\u64\u6f\u111\u61\u6e", "\u75\u6c\u61\u163", ## 1250
+                              "\u62\u44c\u6c", "\u44c\u6d\u69\u74", "\u62\u65\u6c\u65\u64\u44d\u79", "\u6e\u44d\u6c\u44c\u66\u65\u72")) ## 1251 - Turkish iso-8859-9 wrongly detected as windows-1252 or 1250 or 1251. Need to be a little careful because some characters (e.g. \uf0, \ufe) are valid in e.g. Icelandic and Faroese
     badwords_trans <- c("oooo", "ouuoo", "oouoo", "uuou", "uuoo") ## badwords after transliteration, e.g. wrongly-detected cyrillic
     ## get the \uxx numbers from sprintf("%x",utf8ToInt(tolower(dodgy_string_or_char))) or paste0("\\u", sprintf("%x", utf8ToInt(tolower("dodgy"))), collapse = "")
     read_with_enc <- function(filename, enc_to_test) {