From 216ea1a2cbc0c82d59a88aa53b6af384e0cb663b Mon Sep 17 00:00:00 2001 From: gagolews Date: Sat, 22 May 2021 21:31:03 +1000 Subject: [PATCH] #420: %stri$% now uses the (untested) stri_sprintf --- NEWS | 11 +++- R/sprintf.R | 54 +++++++++------- devel/sphinx/conf.py | 2 +- devel/tinytest/test-sprintf.R | 15 ++++- man/stri_sprintf.Rd | 5 +- src/stri_sprintf.cpp | 113 ++++++++++++++++++++++++---------- 6 files changed, 135 insertions(+), 65 deletions(-) diff --git a/NEWS b/NEWS index 51df264e9..9d9503fcd 100644 --- a/NEWS +++ b/NEWS @@ -5,12 +5,17 @@ ## 1.6.3-devel (2021-xx-yy) * TODO ... [NEW FEATURE] #420: `stri_sprintf` (alias: `stri_string_format`) - is a Unicode-aware replacement for the base `sprintf`: - it adds a customised handling of `NA`s (on demand) and - computing field size based on code point width. + is a Unicode-aware replacement for and enhancement of the base `sprintf`: + it adds a customised handling of `NA`s (on demand), + computing field size based on code point width, + outputting substrings of at most given width, + variable width and precision (both at the same time), etc. Moreover, `stri_printf` can be used to display formatted strings conveniently. +* [BACKWARD INCOMPATIBILITY] `%s$%` and `%stri$%` now use `stri_sprintf` + instead of `base::sprintf`. + * TODO ... [NEW FEATURE] #434: `stri_datetime_format` and `stri_datetime_parse` is now also vectorised with respect to the `format` argument. diff --git a/R/sprintf.R b/R/sprintf.R index 7878e7c17..11701154e 100644 --- a/R/sprintf.R +++ b/R/sprintf.R @@ -35,8 +35,9 @@ #' Format Strings #' #' @description -#' A Unicode-aware replacement for the built-in \code{\link[base]{sprintf}} -#' function. Moreover, \code{stri_printf} prints formatted strings. +#' A Unicode-aware replacement for and enhancement of +#' the built-in \code{\link[base]{sprintf}} function. +#' Moreover, \code{stri_printf} prints formatted strings. #' #' @details #' Vectorized over \code{format} and all vectors passed via \code{...}. @@ -191,27 +192,34 @@ stri_printf <- function( if (!is.list(e2)) e2 <- list(e2) - # this is stringi, assure UTF-8 output and proper NA handling! - e1 <- stri_enc_toutf8(as.character(e1)) - if (length(e1) == 0) return(character(0)) - - for (i in seq_along(e2)) { - stopifnot(is.atomic(e2[[i]])) # factor is atomic - if (length(e2[[i]]) == 0) return(character(0)) - if (is.character(e2[[i]]) || is.factor(e2[[i]])) { - e2[[i]] <- stri_enc_toutf8(e2[[i]]) - } - } - - ret <- stri_enc_toutf8(do.call(sprintf, as.list(c(list(e1), e2)))) - # for the time being, let stri_paste determine NAs - # (it might be too greedy if there are unused strings) - which_na <- do.call(stri_paste, e2) - ret[is.na(which_na)] <- NA_character_ - - ret[is.na(e1)] <- NA_character_ - - ret + .Call(C_stri_sprintf, e1, e2, + na_string=NA_character_, + inf_string="Inf", + nan_string="NaN", + use_length=FALSE) + +# old version: based on base::sprintf +# # this is stringi, assure UTF-8 output and proper NA handling! +# e1 <- stri_enc_toutf8(as.character(e1)) +# if (length(e1) == 0) return(character(0)) +# +# for (i in seq_along(e2)) { +# stopifnot(is.atomic(e2[[i]])) # factor is atomic +# if (length(e2[[i]]) == 0) return(character(0)) +# if (is.character(e2[[i]]) || is.factor(e2[[i]])) { +# e2[[i]] <- stri_enc_toutf8(e2[[i]]) +# } +# } +# +# ret <- stri_enc_toutf8(do.call(sprintf, as.list(c(list(e1), e2)))) +# # for the time being, let stri_paste determine NAs +# # (it might be too greedy if there are unused strings) +# which_na <- do.call(stri_paste, e2) +# ret[is.na(which_na)] <- NA_character_ +# +# ret[is.na(e1)] <- NA_character_ +# +# ret } diff --git a/devel/sphinx/conf.py b/devel/sphinx/conf.py index d541b2f0d..ec8e3a1df 100644 --- a/devel/sphinx/conf.py +++ b/devel/sphinx/conf.py @@ -17,7 +17,7 @@ html_short_title = project # The full version, including alpha/beta/rc tags -version = '1.6.2' # TODO: automate +version = '1.6.3' # TODO: automate release = version github_project_url = "https://github.com/gagolews/stringi/" diff --git a/devel/tinytest/test-sprintf.R b/devel/tinytest/test-sprintf.R index e5dd1f1ae..9b56b7862 100644 --- a/devel/tinytest/test-sprintf.R +++ b/devel/tinytest/test-sprintf.R @@ -14,7 +14,8 @@ expect_error(suppressWarnings(stri_sprintf("%-3$s", "a"))) expect_identical(stri_sprintf("%%"), "%") expect_error(suppressWarnings(stri_sprintf("abc%"))) -stri_sprintf("%0000000000000000001$#- *0000002$.*003$f", 1.23456, -10, -3) +stringi::stri_sprintf("%0000000000000000001$#0+ *0000002$.*003$e", 1.23456, -12, 3) +stringi::stri_sprintf("%0000000000000000001$#0+ *0000002$.*003$e", 1.23456, 12, 3) # sprintf("%10.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_)) # sprintf("%010.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_)) @@ -35,10 +36,18 @@ stringi::stri_sprintf("%.*s", -1:8, x) stringi::stri_sprintf("%.*s", -1:8, x, use_length=TRUE) x <- "\u200b\u200b\u200b\u200b\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007Fabcd" -stringi::stri_sprintf("%.*s", -1:8, x) -stringi::stri_sprintf("%.*s", -1:8, x, use_length=TRUE) +stringi::stri_sprintf("%4.*s", -1:8, x) +stringi::stri_sprintf("%4.*s", -1:8, x, use_length=TRUE) +x <- "\u200b\u200b\u200b\u200b\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007Fabcd" +stringi::stri_sprintf("[%-16.*s]", -1:8, x) +stringi::stri_sprintf("[%16.*s]", -1:8, x) + +stringi::stri_sprintf("[%*.*s]", 1:8, 1:8, x) +stringi::stri_sprintf("[%-*.*s]", 1:8, 1:8, x) +stringi::stri_sprintf("[%*s]", 1:8, x) +stringi::stri_sprintf("[%-*s]", 1:8, x) ' sprintf("%2$s", 1, 2) # warning - unsused arg diff --git a/man/stri_sprintf.Rd b/man/stri_sprintf.Rd index 8779ed219..ee90ca677 100644 --- a/man/stri_sprintf.Rd +++ b/man/stri_sprintf.Rd @@ -50,8 +50,9 @@ text on the standard output or other connection/file. Hence, it returns The other functions return a character vector. } \description{ -A Unicode-aware replacement for the built-in \code{\link[base]{sprintf}} -function. Moreover, \code{stri_printf} prints formatted strings. +A Unicode-aware replacement for and enhancement of +the built-in \code{\link[base]{sprintf}} function. +Moreover, \code{stri_printf} prints formatted strings. } \details{ Vectorized over \code{format} and all vectors passed via \code{...}. diff --git a/src/stri_sprintf.cpp b/src/stri_sprintf.cpp index 33ae17eec..7be316f49 100644 --- a/src/stri_sprintf.cpp +++ b/src/stri_sprintf.cpp @@ -76,6 +76,17 @@ typedef enum { } StriSprintfType; +/** data types for sprintf + * + * @version 1.6.3 (Marek Gagolewski, 2021-05-20) + */ +typedef enum { + STRI_SPRINTF_FORMAT_STATUS_OK=0, + STRI_SPRINTF_FORMAT_STATUS_IS_NA, + STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING +} StriSprintfFormatStatus; + + /** * if delim found, stops right after delim, modifies jc in place * if delim not found, returns NA_INTEGER or throws an error @@ -421,8 +432,9 @@ class StriSprintfFormatSpec // followed by '$' and the call below will return NA_INTEGER which_datum = stri__atoi_to_delim( f, /*by reference*/jc, j0, j1, /*delimiter*/'$', false/*throw_error*/ - )-1/*0-based indexing*/; + ); // result can be NA_INTEGER; incorrect indexes will be caught by get* + if (which_datum != NA_INTEGER) which_datum--; /*0-based indexing*/ } // 2. optional flags [ +0#-] @@ -446,7 +458,8 @@ class StriSprintfFormatSpec if (f[jc] >= '0' && f[jc] <= '9') { which_width = stri__atoi_to_delim( f, /*by reference*/jc, j0, j1, /*delimiter*/'$' - )-1/*0-based indexing*/; + ); + if (which_width != NA_INTEGER) which_width--; /*0-based indexing*/ } min_width = data.getIntegerOrNA(which_width); } @@ -470,7 +483,8 @@ class StriSprintfFormatSpec if (f[jc] >= '0' && f[jc] <= '9') { which_precision = stri__atoi_to_delim( f, /*by reference*/jc, j0, j1, /*delimiter*/'$' - )-1/*0-based indexing*/; + ); + if (which_precision != NA_INTEGER) which_precision--; /*0-based indexing*/ } precision = data.getIntegerOrNA(which_precision); } @@ -546,45 +560,60 @@ class StriSprintfFormatSpec } - std::string formatDatum() + StriSprintfFormatStatus formatDatum(std::string& preformatted_datum) { - std::string preformatted_datum; - bool needs_padding; + StriSprintfFormatStatus status; if (type == STRI_SPRINTF_TYPE_INTEGER) { int datum = data.getIntegerOrNA(which_datum); - needs_padding = preformatDatum_doxX(preformatted_datum/*by reference*/, datum); + status = preformatDatum_doxX(preformatted_datum/*by reference*/, datum); } else if (type == STRI_SPRINTF_TYPE_DOUBLE) { double datum = data.getDoubleOrNA(which_datum); - needs_padding = preformatDatum_feEgGaA(preformatted_datum/*by reference*/, datum); + status = preformatDatum_feEgGaA(preformatted_datum/*by reference*/, datum); } else { // string const String8& datum = data.getStringOrNA(which_datum); - needs_padding = preformatDatum_s(preformatted_datum, datum); + status = preformatDatum_s(preformatted_datum, datum); } - if (!needs_padding) - return preformatted_datum; + if (status != STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING) + return status; - // now we need to pad with spaces from left or right up to min_width - // based on width or length (use_length) + if (min_width == NA_INTEGER) + return STRI_SPRINTF_FORMAT_STATUS_OK; - // btw: pad_from_right always add spaces - // btw: pad_zero "-00000" "+00000" " 00000" "0x0000" "0X0000" - // but not NA/Inf/... and only numerics, - // and this needs_padding no more (already dealt with) + STRI_ASSERT(min_width > 0); + R_len_t datum_size; + if (use_length) // number of code points + datum_size = stri__length_string(preformatted_datum.c_str(), + preformatted_datum.length()); + else + datum_size = stri__width_string(preformatted_datum.c_str(), + preformatted_datum.length()); -// if (use_length) width = str.countCodePoints(); -// else width = stri__width_string(str.c_str(), str.length()) + if (datum_size < min_width) { + // now we need to pad with spaces from left or right up to min_width + // based on width or length (use_length) - return preformatted_datum; // TODO + // btw: pad_from_right always add spaces + // btw: pad_zero "-00000" "+00000" " 00000" "0x0000" "0X0000" + // but not NA/Inf/... and only numerics, + // and this needs_padding no more (already dealt with) + + if (pad_from_right) + preformatted_datum.append(min_width-datum_size, ' '); + else + preformatted_datum.assign(std::string(min_width-datum_size, ' ') + preformatted_datum); + } + + return STRI_SPRINTF_FORMAT_STATUS_OK; } private: - bool preformatDatum_doxX(std::string& preformatted_datum, int datum) + StriSprintfFormatStatus preformatDatum_doxX(std::string& preformatted_datum, int datum) { STRI_ASSERT(type_spec != 'i'); // normalised i->d if (datum != NA_INTEGER) { @@ -603,11 +632,15 @@ class StriSprintfFormatSpec snprintf(buf.data(), bufsize, format_string.c_str(), datum); preformatted_datum.append(buf.data()); - return false; /* all in ASCII, padding done by std::snprintf */ + return STRI_SPRINTF_FORMAT_STATUS_OK; /* all in ASCII, padding done by std::snprintf */ } else { STRI_ASSERT(type_spec == 'd' || !sign_plus); STRI_ASSERT(type_spec == 'd' || !sign_space); + + if (na_string.isNA()) + return STRI_SPRINTF_FORMAT_STATUS_IS_NA; + if (sign_plus) { // glibc produces "+nan", but we will output " nan" instead preformatted_datum.push_back(' '); @@ -617,12 +650,12 @@ class StriSprintfFormatSpec // else no sign preformatted_datum.append(na_string.c_str()); - return true; /* might need padding (na_string can be fancy Unicode) */ + return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding (na_string can be fancy Unicode) */ } } - bool preformatDatum_feEgGaA(std::string& preformatted_datum, double datum) + StriSprintfFormatStatus preformatDatum_feEgGaA(std::string& preformatted_datum, double datum) { if (R_FINITE(datum)) { STRI_ASSERT(min_width == NA_INTEGER || min_width >= 0); @@ -640,7 +673,7 @@ class StriSprintfFormatSpec snprintf(buf.data(), bufsize, format_string.c_str(), datum); preformatted_datum.append(buf.data()); - return false; /* all in ASCII, padding done by std::snprintf */ + return STRI_SPRINTF_FORMAT_STATUS_OK; /* all in ASCII, padding done by std::snprintf */ } else { if (ISNA(datum) || ISNAN(datum)) { @@ -663,19 +696,28 @@ class StriSprintfFormatSpec } // alternate_output has no effect (use inf_string etc. instead) - if (ISNA(datum)) + if (ISNA(datum)) { + if (na_string.isNA()) + return STRI_SPRINTF_FORMAT_STATUS_IS_NA; preformatted_datum.append(na_string.c_str()); - else if (ISNAN(datum)) + } + else if (ISNAN(datum)) { + if (nan_string.isNA()) + return STRI_SPRINTF_FORMAT_STATUS_IS_NA; preformatted_datum.append(nan_string.c_str()); - else + } + else { + if (inf_string.isNA()) + return STRI_SPRINTF_FORMAT_STATUS_IS_NA; preformatted_datum.append(inf_string.c_str()); + } - return true; /* might need padding (na_string can be fancy Unicode) */ + return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding (na_string can be fancy Unicode) */ } } - bool preformatDatum_s(std::string& preformatted_datum, const String8& datum) + StriSprintfFormatStatus preformatDatum_s(std::string& preformatted_datum, const String8& datum) { STRI_ASSERT(!pad_zero); STRI_ASSERT(!sign_plus); @@ -684,7 +726,7 @@ class StriSprintfFormatSpec STRI_ASSERT(precision == NA_INTEGER || precision >= 0); if (!datum.isNA()) { - R_len_t datum_size = datum.length(); + R_len_t datum_size = datum.length(); // this is byte count if (precision != NA_INTEGER) { if (use_length) { // ha! output no more than code points @@ -698,10 +740,13 @@ class StriSprintfFormatSpec preformatted_datum.append(datum.c_str(), datum_size); } else { // isNA + if (na_string.isNA()) + return STRI_SPRINTF_FORMAT_STATUS_IS_NA; + preformatted_datum.append(na_string.c_str()); } - return true; /* might need padding */ + return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding */ } }; @@ -753,7 +798,9 @@ SEXP stri__sprintf_1( // debug: Rprintf("*** spec=%s\n", spec.toString().c_str()); // debug: buf.append(spec.toString()); - std::string formatted_datum = spec.formatDatum(); + std::string formatted_datum; + if (spec.formatDatum(formatted_datum) == STRI_SPRINTF_FORMAT_STATUS_IS_NA) + return NA_STRING; buf.append(formatted_datum); }