From aa112ee2e0553ad322b3f3f12218a8597146d6dc Mon Sep 17 00:00:00 2001
From: Ben Raymond
Date: Sun, 1 Dec 2024 10:18:44 +1100
Subject: [PATCH] better (if probably fragile) detection of Turkish text encoding ISO-8859-9; v1.8.5

---
 DESCRIPTION | 2 +-
 R/read_dv.R | 2 +-
 R/util.R    | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1250c4d..6440e36 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: datavolley
 Title: Reading and Analyzing DataVolley Scout Files
-Version: 1.8.4
+Version: 1.8.5
 Authors@R: c(person("Ben", "Raymond", email = "ben@untan.gl", role = c("aut", "cre")),
              person("Adrien", "Ickowicz", role = "aut"),
              person("Tyler", "Widdison", role = "aut"),
diff --git a/R/read_dv.R b/R/read_dv.R
index 9ea4222..0a5ee00 100644
--- a/R/read_dv.R
+++ b/R/read_dv.R
@@ -143,7 +143,7 @@ dv_read <- function(filename, insert_technical_timeouts=TRUE, do_warn=FALSE, do_
         ##if (any(xiso_idx))
         ##    encoding <- c(encoding,gsub("^x\\-iso","iso",encoding[xiso_idx]))
         ## add common ones
-        encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
+        encoding <- c(encoding, c("windows-1252", "iso-8859-2", "windows-1250", "US-ASCII", "UTF-8", "SHIFT-JIS", "CP932", "windows-1251", "iso-8859-9")) ## windows-1252 should be used in preference to "iso-8859-1", see https://en.wikipedia.org/wiki/ISO/IEC_8859-1
         encoding <- encoding[tolower(encoding) %in% tolower(iconvlist())]
         ##if (length(encoding)<=1) encoding <- iconvlist()
     }
diff --git a/R/util.R b/R/util.R
index a6613ac..91c3209 100644
--- a/R/util.R
+++ b/R/util.R
@@ -372,6 +372,9 @@ get_best_encodings <- function(encodings_to_test, filename, read_from = 10, read
     badwords <- c(badwords, tolower(c("\uc9\u57\uc9", "\ue2\u2122"))) ## cp932 wrongly detected as macintosh
     badwords <- c(badwords, c("\ufd\u79")) ## windows-1254 wrongly detected as 1250
     badwords <- c(badwords, c("\u6e\u434\u45a\u69\u434\u45a", "\u76\u69\u434\u45a")) ## "ncic" and "vic" but c with caron (Serbian/Czech/etc) wrongly detected as cyrillic
+    badwords <- c(badwords, c("\u79\u69\uf0\u69\u74", "\u75\u6c\u61\ufe", "\u64\u6f\uf0\u61\u6e", "\u70\u6f\uf0\u6c\u75", "\u6e\ufd\u6c\ufc\u66\u65\u72", ## 1252
+                              "\u64\u6f\u111\u61\u6e", "\u75\u6c\u61\u163", ## 1250
+                              "\u62\u44c\u6c", "\u44c\u6d\u69\u74", "\u62\u65\u6c\u65\u64\u44d\u79", "\u6e\u44d\u6c\u44c\u66\u65\u72")) ## 1251 - Turkish iso-8859-9 wrongly detected as windows-1252 or 1250 or 1251. Need to be a little careful because some characters (e.g. \uf0, \ufe) are valid in e.g. Icelandic and Faroese
     badwords_trans <- c("oooo", "ouuoo", "oouoo", "uuou", "uuoo") ## badwords after transliteration, e.g. wrongly-detected cyrillic
     ## get the \uxx numbers from sprintf("%x",utf8ToInt(tolower(dodgy_string_or_char))) or paste0("\\u", sprintf("%x", utf8ToInt(tolower("dodgy"))), collapse = "")
     read_with_enc <- function(filename, enc_to_test) {
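
Reviewer's note on deriving the "\uxx" escapes in the new badwords entries (a minimal sketch, not part of the patch): per the comment in get_best_encodings(), each escape sequence is the code points of a word as it appears when decoded with the wrong encoding. Assuming the first entry corresponds to the Turkish word "yiğit" read from an ISO-8859-9 file as windows-1252, the byte 0xF0 (ğ) decodes as "\uf0" (ð); the variable name "garbled" below is purely illustrative.

    ## "yiğit" mis-decoded as windows-1252: ğ (0xF0 in ISO-8859-9) comes out as ð
    garbled <- "yi\uf0it"
    ## inspect the code points of the mis-decoded word
    sprintf("%x", utf8ToInt(tolower(garbled)))
    ## [1] "79" "69" "f0" "69" "74"
    ## build the escaped literal to paste into the badwords vector
    paste0("\\u", sprintf("%x", utf8ToInt(tolower(garbled))), collapse = "")
    ## gives the literal text \u79\u69\uf0\u69\u74, matching the first new badword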