From aa112ee2e0553ad322b3f3f12218a8597146d6dc Mon Sep 17 00:00:00 2001
From: Ben Raymond
Date: Sun, 1 Dec 2024 10:18:44 +1100
Subject: [PATCH] better (if probably fragile) detection of Turkish text encoding ISO-8859-9; v1.8.5

---
 DESCRIPTION | 2 +-
 R/read_dv.R | 2 +-
 R/util.R    | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1250c4d..6440e36 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: datavolley
 Title: Reading and Analyzing DataVolley Scout Files
-Version: 1.8.4
+Version: 1.8.5
 Authors@R: c(person("Ben", "Raymond", email = "ben@untan.gl", role = c("aut", "cre")),
              person("Adrien", "Ickowicz", role = "aut"),
              person("Tyler", "Widdison", role = "aut"),
diff --git a/R/read_dv.R b/R/read_dv.R
index 9ea4222..0a5ee00 100644
--- a/R/read_dv.R
+++ b/R/read_dv.R
@@ -143,7 +143,7 @@ dv_read <- function(filename, insert_technical_timeouts=TRUE, do_warn=FALSE, do_
         ##if (any(xiso_idx))
         ##    encoding <- c(encoding,gsub("^x\\-iso","iso",encoding[xiso_idx]))
         ## add common ones
-        encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
+        encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251", "iso-8859-9")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
         encoding <- encoding[tolower(encoding) %in% tolower(iconvlist())]
         ##if (length(encoding)<=1) encoding <- iconvlist()
     }
diff --git a/R/util.R b/R/util.R
index a6613ac..91c3209 100644
--- a/R/util.R
+++ b/R/util.R
@@ -372,6 +372,9 @@ get_best_encodings <- function(encodings_to_test, filename, read_from = 10, read
     badwords <- c(badwords, tolower(c("\uc9\u57\uc9", "\ue2\u2122"))) ## cp932 wrongly detected as macintosh
     badwords <- c(badwords, c("\ufd\u79")) ## windows-1254 wrongly detected as 1250
     badwords <- c(badwords, c("\u6e\u434\u45a\u69\u434\u45a", "\u76\u69\u434\u45a")) ## "ncic" and "vic" but c with caron (Serbian/Czech/etc) wrongly detected as cyrillic
+    badwords <- c(badwords, c("\u79\u69\uf0\u69\u74", "\u75\u6c\u61\ufe", "\u64\u6f\uf0\u61\u6e", "\u70\u6f\uf0\u6c\u75", "\u6e\ufd\u6c\ufc\u66\u65\u72", ## 1252
+                              "\u64\u6f\u111\u61\u6e", "\u75\u6c\u61\u163", ## 1250
+                              "\u62\u44c\u6c", "\u44c\u6d\u69\u74", "\u62\u65\u6c\u65\u64\u44d\u79", "\u6e\u44d\u6c\u44c\u66\u65\u72")) ## 1251 - Turkish iso-8859-9 wrongly detected as windows-1252 or 1250 or 1251. Need to be a little careful because some characters (e.g. \uf0, \ufe) are valid in e.g. Icelandic and Faroese
     badwords_trans <- c("oooo", "ouuoo", "oouoo", "uuou", "uuoo") ## badwords after transliteration, e.g. wrongly-detected cyrillic
     ## get the \uxx numbers from sprintf("%x",utf8ToInt(tolower(dodgy_string_or_char))) or paste0("\\u", sprintf("%x", utf8ToInt(tolower("dodgy"))), collapse = "")
     read_with_enc <- function(filename, enc_to_test) {
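
Reviewer's note on deriving the "\uxx" escapes in the new badwords entries (a minimal sketch, not part of the patch): per the comment in get_best_encodings(), each escape sequence is the code points of a word as it appears when decoded with the wrong encoding. Assuming the first entry corresponds to the Turkish word "yiğit" read from an ISO-8859-9 file as windows-1252, the byte 0xF0 (ğ) decodes as "\uf0" (ð); the variable name "garbled" below is purely illustrative.

    ## "yiğit" mis-decoded as windows-1252: ğ (0xF0 in ISO-8859-9) comes out as ð
    garbled <- "yi\uf0it"
    ## inspect the code points of the mis-decoded word
    sprintf("%x", utf8ToInt(tolower(garbled)))
    ## [1] "79" "69" "f0" "69" "74"
    ## build the escaped literal to paste into the badwords vector
    paste0("\\u", sprintf("%x", utf8ToInt(tolower(garbled))), collapse = "")
    ## gives the literal text \u79\u69\uf0\u69\u74, matching the first new badword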