From 216ea1a2cbc0c82d59a88aa53b6af384e0cb663b Mon Sep 17 00:00:00 2001
From: gagolews <m.gagolewski@gmail.com>
Date: Sat, 22 May 2021 21:31:03 +1000
Subject: [PATCH] #420: %stri$% now uses the (untested) stri_sprintf

---
 NEWS                          |  11 +++-
 R/sprintf.R                   |  54 +++++++++-------
 devel/sphinx/conf.py          |   2 +-
 devel/tinytest/test-sprintf.R |  15 ++++-
 man/stri_sprintf.Rd           |   5 +-
 src/stri_sprintf.cpp          | 113 ++++++++++++++++++++++++----------
 6 files changed, 135 insertions(+), 65 deletions(-)

diff --git a/NEWS b/NEWS
index 51df264e9..9d9503fcd 100644
--- a/NEWS
+++ b/NEWS
@@ -5,12 +5,17 @@
 ## 1.6.3-devel (2021-xx-yy)
 
 * TODO ... [NEW FEATURE] #420: `stri_sprintf` (alias: `stri_string_format`)
-  is a Unicode-aware replacement for the base `sprintf`:
-  it adds a customised handling of `NA`s (on demand) and
-  computing field size based on code point width.
+  is a Unicode-aware replacement for and enhancement of the base `sprintf`:
+  it adds customised handling of `NA`s (on demand),
+  field sizes computed based on code point width,
+  output of substrings of at most a given width,
+  variable width and precision (both at the same time), etc.
   Moreover, `stri_printf` can be used to display formatted strings
   conveniently.
 
+* [BACKWARD INCOMPATIBILITY] `%s$%` and `%stri$%` now use `stri_sprintf`
+  instead of `base::sprintf`.
+
 * TODO ... [NEW FEATURE] #434: `stri_datetime_format` and `stri_datetime_parse`
   is now also vectorised with respect to the `format` argument.
 
diff --git a/R/sprintf.R b/R/sprintf.R
index 7878e7c17..11701154e 100644
--- a/R/sprintf.R
+++ b/R/sprintf.R
@@ -35,8 +35,9 @@
 #' Format Strings
 #'
 #' @description
-#' A Unicode-aware replacement for the built-in \code{\link[base]{sprintf}}
-#' function. Moreover, \code{stri_printf} prints formatted strings.
+#' A Unicode-aware replacement for and enhancement of
+#' the built-in \code{\link[base]{sprintf}} function.
+#' Moreover, \code{stri_printf} prints formatted strings.
 #'
 #' @details
 #' Vectorized over \code{format} and all vectors passed via \code{...}.
@@ -191,27 +192,34 @@ stri_printf <- function(
     if (!is.list(e2))
         e2 <- list(e2)
 
-    # this is stringi, assure UTF-8 output and proper NA handling!
-    e1 <- stri_enc_toutf8(as.character(e1))
-    if (length(e1) == 0) return(character(0))
-
-    for (i in seq_along(e2)) {
-        stopifnot(is.atomic(e2[[i]]))  # factor is atomic
-        if (length(e2[[i]]) == 0) return(character(0))
-        if (is.character(e2[[i]]) || is.factor(e2[[i]])) {
-            e2[[i]] <- stri_enc_toutf8(e2[[i]])
-        }
-    }
-
-    ret <- stri_enc_toutf8(do.call(sprintf, as.list(c(list(e1), e2))))
-    # for the time being, let stri_paste determine NAs
-    # (it might be too greedy if there are unused strings)
-    which_na <- do.call(stri_paste, e2)
-    ret[is.na(which_na)] <- NA_character_
-
-    ret[is.na(e1)] <- NA_character_
-
-    ret
+    .Call(C_stri_sprintf, e1, e2,
+        na_string=NA_character_,
+        inf_string="Inf",
+        nan_string="NaN",
+        use_length=FALSE)
+
+#  old version: based on base::sprintf
+#     # this is stringi, assure UTF-8 output and proper NA handling!
+#     e1 <- stri_enc_toutf8(as.character(e1))
+#     if (length(e1) == 0) return(character(0))
+#
+#     for (i in seq_along(e2)) {
+#         stopifnot(is.atomic(e2[[i]]))  # factor is atomic
+#         if (length(e2[[i]]) == 0) return(character(0))
+#         if (is.character(e2[[i]]) || is.factor(e2[[i]])) {
+#             e2[[i]] <- stri_enc_toutf8(e2[[i]])
+#         }
+#     }
+#
+#     ret <- stri_enc_toutf8(do.call(sprintf, as.list(c(list(e1), e2))))
+#     # for the time being, let stri_paste determine NAs
+#     # (it might be too greedy if there are unused strings)
+#     which_na <- do.call(stri_paste, e2)
+#     ret[is.na(which_na)] <- NA_character_
+#
+#     ret[is.na(e1)] <- NA_character_
+#
+#     ret
 }
 
 
diff --git a/devel/sphinx/conf.py b/devel/sphinx/conf.py
index d541b2f0d..ec8e3a1df 100644
--- a/devel/sphinx/conf.py
+++ b/devel/sphinx/conf.py
@@ -17,7 +17,7 @@
 html_short_title = project
 
 # The full version, including alpha/beta/rc tags
-version = '1.6.2'  # TODO: automate
+version = '1.6.3'  # TODO: automate
 release = version
 
 github_project_url = "https://github.com/gagolews/stringi/"
diff --git a/devel/tinytest/test-sprintf.R b/devel/tinytest/test-sprintf.R
index e5dd1f1ae..9b56b7862 100644
--- a/devel/tinytest/test-sprintf.R
+++ b/devel/tinytest/test-sprintf.R
@@ -14,7 +14,8 @@ expect_error(suppressWarnings(stri_sprintf("%-3$s", "a")))
 expect_identical(stri_sprintf("%%"), "%")
 expect_error(suppressWarnings(stri_sprintf("abc%")))
 
-stri_sprintf("%0000000000000000001$#- *0000002$.*003$f", 1.23456, -10, -3)
+stringi::stri_sprintf("%0000000000000000001$#0+ *0000002$.*003$e", 1.23456, -12, 3)
+stringi::stri_sprintf("%0000000000000000001$#0+ *0000002$.*003$e", 1.23456, 12, 3)
 
 # sprintf("%10.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_))
 # sprintf("%010.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_))
@@ -35,10 +36,18 @@ stringi::stri_sprintf("%.*s", -1:8, x)
 stringi::stri_sprintf("%.*s", -1:8, x, use_length=TRUE)
 
 x <- "\u200b\u200b\u200b\u200b\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007Fabcd"
-stringi::stri_sprintf("%.*s", -1:8, x)
-stringi::stri_sprintf("%.*s", -1:8, x, use_length=TRUE)
+stringi::stri_sprintf("%4.*s", -1:8, x)
+stringi::stri_sprintf("%4.*s", -1:8, x, use_length=TRUE)
 
 
+x <- "\u200b\u200b\u200b\u200b\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007Fabcd"
+stringi::stri_sprintf("[%-16.*s]", -1:8, x)
+stringi::stri_sprintf("[%16.*s]", -1:8, x)
+
+stringi::stri_sprintf("[%*.*s]", 1:8, 1:8, x)
+stringi::stri_sprintf("[%-*.*s]", 1:8, 1:8, x)
+stringi::stri_sprintf("[%*s]", 1:8, x)
+stringi::stri_sprintf("[%-*s]", 1:8, x)
 
 '
 sprintf("%2$s", 1, 2)  # warning - unsused arg
diff --git a/man/stri_sprintf.Rd b/man/stri_sprintf.Rd
index 8779ed219..ee90ca677 100644
--- a/man/stri_sprintf.Rd
+++ b/man/stri_sprintf.Rd
@@ -50,8 +50,9 @@ text on the standard output or other connection/file. Hence, it returns
 The other functions return a character vector.
 }
 \description{
-A Unicode-aware replacement for the built-in \code{\link[base]{sprintf}}
-function. Moreover, \code{stri_printf} prints formatted strings.
+A Unicode-aware replacement for and enhancement of
+the built-in \code{\link[base]{sprintf}} function.
+Moreover, \code{stri_printf} prints formatted strings.
 }
 \details{
 Vectorized over \code{format} and all vectors passed via \code{...}.
diff --git a/src/stri_sprintf.cpp b/src/stri_sprintf.cpp
index 33ae17eec..7be316f49 100644
--- a/src/stri_sprintf.cpp
+++ b/src/stri_sprintf.cpp
@@ -76,6 +76,17 @@ typedef enum {
 } StriSprintfType;
 
 
+/** status codes returned by formatDatum and the preformatDatum_* helpers
+ *
+ * @version 1.6.3 (Marek Gagolewski, 2021-05-20)
+ */
+typedef enum {
+    STRI_SPRINTF_FORMAT_STATUS_OK=0,  // datum is fully formatted; no (further) padding is required
+    STRI_SPRINTF_FORMAT_STATUS_IS_NA,  // datum is missing/non-finite and its replacement string is NA
+    STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING  // datum is preformatted; formatDatum may still pad it up to min_width
+} StriSprintfFormatStatus;
+
+
 /**
  * if delim found, stops right after delim, modifies jc in place
  * if delim not found, returns NA_INTEGER or throws an error
@@ -421,8 +432,9 @@ class StriSprintfFormatSpec
             // followed by '$' and the call below will return NA_INTEGER
             which_datum = stri__atoi_to_delim(
                 f, /*by reference*/jc, j0, j1, /*delimiter*/'$', false/*throw_error*/
-            )-1/*0-based indexing*/;
+            );
             // result can be NA_INTEGER; incorrect indexes will be caught by get*
+            if (which_datum != NA_INTEGER) which_datum--; /*0-based indexing*/
         }
 
         // 2. optional flags [ +0#-]
@@ -446,7 +458,8 @@ class StriSprintfFormatSpec
             if (f[jc] >= '0' && f[jc] <= '9') {
                 which_width = stri__atoi_to_delim(
                     f, /*by reference*/jc, j0, j1, /*delimiter*/'$'
-                )-1/*0-based indexing*/;
+                );
+                if (which_width != NA_INTEGER) which_width--; /*0-based indexing*/
             }
             min_width = data.getIntegerOrNA(which_width);
         }
@@ -470,7 +483,8 @@ class StriSprintfFormatSpec
                 if (f[jc] >= '0' && f[jc] <= '9') {
                     which_precision = stri__atoi_to_delim(
                         f, /*by reference*/jc, j0, j1, /*delimiter*/'$'
-                    )-1/*0-based indexing*/;
+                    );
+                    if (which_precision != NA_INTEGER) which_precision--; /*0-based indexing*/
                 }
                 precision = data.getIntegerOrNA(which_precision);
             }
@@ -546,45 +560,60 @@ class StriSprintfFormatSpec
     }
 
 
-    std::string formatDatum()
+    StriSprintfFormatStatus formatDatum(std::string& preformatted_datum)
     {
-        std::string preformatted_datum;
-        bool needs_padding;
+        StriSprintfFormatStatus status;
         if (type == STRI_SPRINTF_TYPE_INTEGER) {
             int datum = data.getIntegerOrNA(which_datum);
-            needs_padding = preformatDatum_doxX(preformatted_datum/*by reference*/, datum);
+            status = preformatDatum_doxX(preformatted_datum/*by reference*/, datum);
         }
         else if (type == STRI_SPRINTF_TYPE_DOUBLE) {
             double datum = data.getDoubleOrNA(which_datum);
-            needs_padding = preformatDatum_feEgGaA(preformatted_datum/*by reference*/, datum);
+            status = preformatDatum_feEgGaA(preformatted_datum/*by reference*/, datum);
         }
         else { // string
             const String8& datum = data.getStringOrNA(which_datum);
-            needs_padding = preformatDatum_s(preformatted_datum, datum);
+            status = preformatDatum_s(preformatted_datum, datum);
         }
 
-        if (!needs_padding)
-            return preformatted_datum;
+        if (status != STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING)
+            return status;
 
-        // now we need to pad with spaces from left or right up to min_width
-        // based on width or length (use_length)
+        if (min_width == NA_INTEGER)
+            return STRI_SPRINTF_FORMAT_STATUS_OK;
 
-        // btw: pad_from_right always add spaces
-        // btw: pad_zero   "-00000" "+00000" " 00000" "0x0000" "0X0000"
-        //     but not NA/Inf/... and only numerics,
-        //     and this needs_padding no more (already dealt with)
+        STRI_ASSERT(min_width > 0);
 
+        R_len_t datum_size;
+        if (use_length)  // number of code points
+            datum_size = stri__length_string(preformatted_datum.c_str(),
+                                             preformatted_datum.length());
+        else
+            datum_size = stri__width_string(preformatted_datum.c_str(),
+                                            preformatted_datum.length());
 
-//         if (use_length) width = str.countCodePoints();
-//         else width = stri__width_string(str.c_str(), str.length())
+        if (datum_size < min_width) {
+            // now we need to pad with spaces from left or right up to min_width
+            // based on width or length (use_length)
 
-        return preformatted_datum; // TODO
+            // btw: pad_from_right always adds spaces
+            // btw: pad_zero   "-00000" "+00000" " 00000" "0x0000" "0X0000"
+            //     but not NA/Inf/... and only numerics,
+            //     and that case needs no further padding here (already dealt with)
+
+            if (pad_from_right)
+                preformatted_datum.append(min_width-datum_size, ' ');
+            else
+                preformatted_datum.assign(std::string(min_width-datum_size, ' ') + preformatted_datum);
+        }
+
+        return STRI_SPRINTF_FORMAT_STATUS_OK;
     }
 
 
 private:
 
-    bool preformatDatum_doxX(std::string& preformatted_datum, int datum)
+    StriSprintfFormatStatus preformatDatum_doxX(std::string& preformatted_datum, int datum)
     {
         STRI_ASSERT(type_spec != 'i');  // normalised i->d
         if (datum != NA_INTEGER) {
@@ -603,11 +632,15 @@ class StriSprintfFormatSpec
             snprintf(buf.data(), bufsize, format_string.c_str(), datum);
             preformatted_datum.append(buf.data());
 
-            return false;  /* all in ASCII, padding done by std::snprintf */
+            return STRI_SPRINTF_FORMAT_STATUS_OK;  /* all in ASCII, padding done by std::snprintf */
         }
         else {
             STRI_ASSERT(type_spec == 'd' || !sign_plus);
             STRI_ASSERT(type_spec == 'd' || !sign_space);
+
+            if (na_string.isNA())
+                return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
+
             if (sign_plus) {
                 // glibc produces "+nan", but we will output " nan" instead
                 preformatted_datum.push_back(' ');
@@ -617,12 +650,12 @@ class StriSprintfFormatSpec
             // else no sign
 
             preformatted_datum.append(na_string.c_str());
-            return true;  /* might need padding (na_string can be fancy Unicode) */
+            return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING;  /* might need padding (na_string can be fancy Unicode) */
         }
     }
 
 
-    bool preformatDatum_feEgGaA(std::string& preformatted_datum, double datum)
+    StriSprintfFormatStatus preformatDatum_feEgGaA(std::string& preformatted_datum, double datum)
     {
         if (R_FINITE(datum)) {
             STRI_ASSERT(min_width == NA_INTEGER || min_width >= 0);
@@ -640,7 +673,7 @@ class StriSprintfFormatSpec
             snprintf(buf.data(), bufsize, format_string.c_str(), datum);
             preformatted_datum.append(buf.data());
 
-            return false;  /* all in ASCII, padding done by std::snprintf */
+            return STRI_SPRINTF_FORMAT_STATUS_OK;  /* all in ASCII, padding done by std::snprintf */
         }
         else {
             if (ISNA(datum) || ISNAN(datum)) {
@@ -663,19 +696,28 @@ class StriSprintfFormatSpec
             }
 
             // alternate_output has no effect (use inf_string etc. instead)
-            if (ISNA(datum))
+            if (ISNA(datum)) {
+                if (na_string.isNA())
+                    return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
                 preformatted_datum.append(na_string.c_str());
-            else if (ISNAN(datum))
+            }
+            else if (ISNAN(datum)) {
+                if (nan_string.isNA())
+                    return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
                 preformatted_datum.append(nan_string.c_str());
-            else
+            }
+            else {
+                if (inf_string.isNA())
+                    return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
                 preformatted_datum.append(inf_string.c_str());
+            }
 
-            return true;  /* might need padding (na_string can be fancy Unicode) */
+            return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING;  /* might need padding (na_string can be fancy Unicode) */
         }
     }
 
 
-    bool preformatDatum_s(std::string& preformatted_datum, const String8& datum)
+    StriSprintfFormatStatus preformatDatum_s(std::string& preformatted_datum, const String8& datum)
     {
         STRI_ASSERT(!pad_zero);
         STRI_ASSERT(!sign_plus);
@@ -684,7 +726,7 @@ class StriSprintfFormatSpec
         STRI_ASSERT(precision == NA_INTEGER || precision >= 0);
 
         if (!datum.isNA()) {
-            R_len_t datum_size = datum.length();
+            R_len_t datum_size = datum.length();  // this is byte count
             if (precision != NA_INTEGER) {
                 if (use_length) {
                     // ha! output no more than <precision> code points
@@ -698,10 +740,13 @@ class StriSprintfFormatSpec
             preformatted_datum.append(datum.c_str(), datum_size);
         }
         else { // isNA
+            if (na_string.isNA())
+                return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
+
             preformatted_datum.append(na_string.c_str());
         }
 
-        return true;  /* might need padding */
+        return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING;  /* might need padding */
     }
 };
 
@@ -753,7 +798,9 @@ SEXP stri__sprintf_1(
         // debug: Rprintf("*** spec=%s\n", spec.toString().c_str());
         // debug: buf.append(spec.toString());
 
-        std::string formatted_datum = spec.formatDatum();
+        std::string formatted_datum;
+        if (spec.formatDatum(formatted_datum) == STRI_SPRINTF_FORMAT_STATUS_IS_NA)
+            return NA_STRING;
 
         buf.append(formatted_datum);
     }