Skip to content

Commit

Permalink
#420: %stri$% now uses the (untested) stri_sprintf
Browse files Browse the repository at this point in the history
  • Loading branch information
gagolews committed May 22, 2021
1 parent 2cb0af5 commit 216ea1a
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 65 deletions.
11 changes: 8 additions & 3 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,17 @@
## 1.6.3-devel (2021-xx-yy)

* TODO ... [NEW FEATURE] #420: `stri_sprintf` (alias: `stri_string_format`)
is a Unicode-aware replacement for the base `sprintf`:
it adds a customised handling of `NA`s (on demand) and
computing field size based on code point width.
is a Unicode-aware replacement for and enhancement of the base `sprintf`:
it adds a customised handling of `NA`s (on demand),
computing field size based on code point width,
outputting substrings of at most given width,
variable width and precision (both at the same time), etc.
Moreover, `stri_printf` can be used to display formatted strings
conveniently.

* [BACKWARD INCOMPATIBILITY] `%s$%` and `%stri$%` now use `stri_sprintf`
instead of `base::sprintf`.

* TODO ... [NEW FEATURE] #434: `stri_datetime_format` and `stri_datetime_parse`
are now also vectorised with respect to the `format` argument.

Expand Down
54 changes: 31 additions & 23 deletions R/sprintf.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
#' Format Strings
#'
#' @description
#' A Unicode-aware replacement for the built-in \code{\link[base]{sprintf}}
#' function. Moreover, \code{stri_printf} prints formatted strings.
#' A Unicode-aware replacement for and enhancement of
#' the built-in \code{\link[base]{sprintf}} function.
#' Moreover, \code{stri_printf} prints formatted strings.
#'
#' @details
#' Vectorized over \code{format} and all vectors passed via \code{...}.
Expand Down Expand Up @@ -191,27 +192,34 @@ stri_printf <- function(
if (!is.list(e2))
e2 <- list(e2)

# this is stringi, assure UTF-8 output and proper NA handling!
e1 <- stri_enc_toutf8(as.character(e1))
if (length(e1) == 0) return(character(0))

for (i in seq_along(e2)) {
stopifnot(is.atomic(e2[[i]])) # factor is atomic
if (length(e2[[i]]) == 0) return(character(0))
if (is.character(e2[[i]]) || is.factor(e2[[i]])) {
e2[[i]] <- stri_enc_toutf8(e2[[i]])
}
}

ret <- stri_enc_toutf8(do.call(sprintf, as.list(c(list(e1), e2))))
# for the time being, let stri_paste determine NAs
# (it might be too greedy if there are unused strings)
which_na <- do.call(stri_paste, e2)
ret[is.na(which_na)] <- NA_character_

ret[is.na(e1)] <- NA_character_

ret
.Call(C_stri_sprintf, e1, e2,
na_string=NA_character_,
inf_string="Inf",
nan_string="NaN",
use_length=FALSE)

# old version: based on base::sprintf
# # this is stringi, assure UTF-8 output and proper NA handling!
# e1 <- stri_enc_toutf8(as.character(e1))
# if (length(e1) == 0) return(character(0))
#
# for (i in seq_along(e2)) {
# stopifnot(is.atomic(e2[[i]])) # factor is atomic
# if (length(e2[[i]]) == 0) return(character(0))
# if (is.character(e2[[i]]) || is.factor(e2[[i]])) {
# e2[[i]] <- stri_enc_toutf8(e2[[i]])
# }
# }
#
# ret <- stri_enc_toutf8(do.call(sprintf, as.list(c(list(e1), e2))))
# # for the time being, let stri_paste determine NAs
# # (it might be too greedy if there are unused strings)
# which_na <- do.call(stri_paste, e2)
# ret[is.na(which_na)] <- NA_character_
#
# ret[is.na(e1)] <- NA_character_
#
# ret
}


Expand Down
2 changes: 1 addition & 1 deletion devel/sphinx/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
html_short_title = project

# The full version, including alpha/beta/rc tags
version = '1.6.2' # TODO: automate
version = '1.6.3' # TODO: automate
release = version

github_project_url = "https://github.com/gagolews/stringi/"
Expand Down
15 changes: 12 additions & 3 deletions devel/tinytest/test-sprintf.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ expect_error(suppressWarnings(stri_sprintf("%-3$s", "a")))
expect_identical(stri_sprintf("%%"), "%")
expect_error(suppressWarnings(stri_sprintf("abc%")))

stri_sprintf("%0000000000000000001$#- *0000002$.*003$f", 1.23456, -10, -3)
stringi::stri_sprintf("%0000000000000000001$#0+ *0000002$.*003$e", 1.23456, -12, 3)
stringi::stri_sprintf("%0000000000000000001$#0+ *0000002$.*003$e", 1.23456, 12, 3)

# sprintf("%10.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_))
# sprintf("%010.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_))
Expand All @@ -35,10 +36,18 @@ stringi::stri_sprintf("%.*s", -1:8, x)
stringi::stri_sprintf("%.*s", -1:8, x, use_length=TRUE)

x <- "\u200b\u200b\u200b\u200b\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007Fabcd"
stringi::stri_sprintf("%.*s", -1:8, x)
stringi::stri_sprintf("%.*s", -1:8, x, use_length=TRUE)
stringi::stri_sprintf("%4.*s", -1:8, x)
stringi::stri_sprintf("%4.*s", -1:8, x, use_length=TRUE)


x <- "\u200b\u200b\u200b\u200b\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007Fabcd"
stringi::stri_sprintf("[%-16.*s]", -1:8, x)
stringi::stri_sprintf("[%16.*s]", -1:8, x)

stringi::stri_sprintf("[%*.*s]", 1:8, 1:8, x)
stringi::stri_sprintf("[%-*.*s]", 1:8, 1:8, x)
stringi::stri_sprintf("[%*s]", 1:8, x)
stringi::stri_sprintf("[%-*s]", 1:8, x)

'
sprintf("%2$s", 1, 2) # warning - unused arg
Expand Down
5 changes: 3 additions & 2 deletions man/stri_sprintf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

113 changes: 80 additions & 33 deletions src/stri_sprintf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,17 @@ typedef enum {
} StriSprintfType;


/** status codes returned by the datum (pre)formatting routines of sprintf
*
* Returned by StriSprintfFormatSpec::formatDatum and by the
* preformatDatum_* helpers it dispatches to; the formatted text itself
* is passed back via a by-reference std::string argument.
*
* @version 1.6.3 (Marek Gagolewski, 2021-05-20)
*/
typedef enum {
// the output string is final: either std::snprintf already applied
// the padding (finite numerics) or formatDatum has finished padding it
STRI_SPRINTF_FORMAT_STATUS_OK=0,
// the datum is missing/NaN/Inf and the corresponding replacement string
// (na_string, nan_string, inf_string) is itself NA; the caller
// (stri__sprintf_1) then yields NA_STRING for the whole output item
STRI_SPRINTF_FORMAT_STATUS_IS_NA,
// the output string was produced without applying min_width
// (strings and NA/NaN/Inf replacement strings, which can be non-ASCII);
// formatDatum pads it up to min_width and then reports ..._STATUS_OK
STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING
} StriSprintfFormatStatus;


/**
* if delim found, stops right after delim, modifies jc in place
* if delim not found, returns NA_INTEGER or throws an error
Expand Down Expand Up @@ -421,8 +432,9 @@ class StriSprintfFormatSpec
// followed by '$' and the call below will return NA_INTEGER
which_datum = stri__atoi_to_delim(
f, /*by reference*/jc, j0, j1, /*delimiter*/'$', false/*throw_error*/
)-1/*0-based indexing*/;
);
// result can be NA_INTEGER; incorrect indexes will be caught by get*
if (which_datum != NA_INTEGER) which_datum--; /*0-based indexing*/
}

// 2. optional flags [ +0#-]
Expand All @@ -446,7 +458,8 @@ class StriSprintfFormatSpec
if (f[jc] >= '0' && f[jc] <= '9') {
which_width = stri__atoi_to_delim(
f, /*by reference*/jc, j0, j1, /*delimiter*/'$'
)-1/*0-based indexing*/;
);
if (which_width != NA_INTEGER) which_width--; /*0-based indexing*/
}
min_width = data.getIntegerOrNA(which_width);
}
Expand All @@ -470,7 +483,8 @@ class StriSprintfFormatSpec
if (f[jc] >= '0' && f[jc] <= '9') {
which_precision = stri__atoi_to_delim(
f, /*by reference*/jc, j0, j1, /*delimiter*/'$'
)-1/*0-based indexing*/;
);
if (which_precision != NA_INTEGER) which_precision--; /*0-based indexing*/
}
precision = data.getIntegerOrNA(which_precision);
}
Expand Down Expand Up @@ -546,45 +560,60 @@ class StriSprintfFormatSpec
}


std::string formatDatum()
StriSprintfFormatStatus formatDatum(std::string& preformatted_datum)
{
std::string preformatted_datum;
bool needs_padding;
StriSprintfFormatStatus status;
if (type == STRI_SPRINTF_TYPE_INTEGER) {
int datum = data.getIntegerOrNA(which_datum);
needs_padding = preformatDatum_doxX(preformatted_datum/*by reference*/, datum);
status = preformatDatum_doxX(preformatted_datum/*by reference*/, datum);
}
else if (type == STRI_SPRINTF_TYPE_DOUBLE) {
double datum = data.getDoubleOrNA(which_datum);
needs_padding = preformatDatum_feEgGaA(preformatted_datum/*by reference*/, datum);
status = preformatDatum_feEgGaA(preformatted_datum/*by reference*/, datum);
}
else { // string
const String8& datum = data.getStringOrNA(which_datum);
needs_padding = preformatDatum_s(preformatted_datum, datum);
status = preformatDatum_s(preformatted_datum, datum);
}

if (!needs_padding)
return preformatted_datum;
if (status != STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING)
return status;

// now we need to pad with spaces from left or right up to min_width
// based on width or length (use_length)
if (min_width == NA_INTEGER)
return STRI_SPRINTF_FORMAT_STATUS_OK;

// btw: pad_from_right always add spaces
// btw: pad_zero "-00000" "+00000" " 00000" "0x0000" "0X0000"
// but not NA/Inf/... and only numerics,
// and this needs_padding no more (already dealt with)
STRI_ASSERT(min_width > 0);

R_len_t datum_size;
if (use_length) // number of code points
datum_size = stri__length_string(preformatted_datum.c_str(),
preformatted_datum.length());
else
datum_size = stri__width_string(preformatted_datum.c_str(),
preformatted_datum.length());

// if (use_length) width = str.countCodePoints();
// else width = stri__width_string(str.c_str(), str.length())
if (datum_size < min_width) {
// now we need to pad with spaces from left or right up to min_width
// based on width or length (use_length)

return preformatted_datum; // TODO
// btw: pad_from_right always add spaces
// btw: pad_zero "-00000" "+00000" " 00000" "0x0000" "0X0000"
// but not NA/Inf/... and only numerics,
// and this needs_padding no more (already dealt with)

if (pad_from_right)
preformatted_datum.append(min_width-datum_size, ' ');
else
preformatted_datum.assign(std::string(min_width-datum_size, ' ') + preformatted_datum);
}

return STRI_SPRINTF_FORMAT_STATUS_OK;
}


private:

bool preformatDatum_doxX(std::string& preformatted_datum, int datum)
StriSprintfFormatStatus preformatDatum_doxX(std::string& preformatted_datum, int datum)
{
STRI_ASSERT(type_spec != 'i'); // normalised i->d
if (datum != NA_INTEGER) {
Expand All @@ -603,11 +632,15 @@ class StriSprintfFormatSpec
snprintf(buf.data(), bufsize, format_string.c_str(), datum);
preformatted_datum.append(buf.data());

return false; /* all in ASCII, padding done by std::snprintf */
return STRI_SPRINTF_FORMAT_STATUS_OK; /* all in ASCII, padding done by std::snprintf */
}
else {
STRI_ASSERT(type_spec == 'd' || !sign_plus);
STRI_ASSERT(type_spec == 'd' || !sign_space);

if (na_string.isNA())
return STRI_SPRINTF_FORMAT_STATUS_IS_NA;

if (sign_plus) {
// glibc produces "+nan", but we will output " nan" instead
preformatted_datum.push_back(' ');
Expand All @@ -617,12 +650,12 @@ class StriSprintfFormatSpec
// else no sign

preformatted_datum.append(na_string.c_str());
return true; /* might need padding (na_string can be fancy Unicode) */
return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding (na_string can be fancy Unicode) */
}
}


bool preformatDatum_feEgGaA(std::string& preformatted_datum, double datum)
StriSprintfFormatStatus preformatDatum_feEgGaA(std::string& preformatted_datum, double datum)
{
if (R_FINITE(datum)) {
STRI_ASSERT(min_width == NA_INTEGER || min_width >= 0);
Expand All @@ -640,7 +673,7 @@ class StriSprintfFormatSpec
snprintf(buf.data(), bufsize, format_string.c_str(), datum);
preformatted_datum.append(buf.data());

return false; /* all in ASCII, padding done by std::snprintf */
return STRI_SPRINTF_FORMAT_STATUS_OK; /* all in ASCII, padding done by std::snprintf */
}
else {
if (ISNA(datum) || ISNAN(datum)) {
Expand All @@ -663,19 +696,28 @@ class StriSprintfFormatSpec
}

// alternate_output has no effect (use inf_string etc. instead)
if (ISNA(datum))
if (ISNA(datum)) {
if (na_string.isNA())
return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
preformatted_datum.append(na_string.c_str());
else if (ISNAN(datum))
}
else if (ISNAN(datum)) {
if (nan_string.isNA())
return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
preformatted_datum.append(nan_string.c_str());
else
}
else {
if (inf_string.isNA())
return STRI_SPRINTF_FORMAT_STATUS_IS_NA;
preformatted_datum.append(inf_string.c_str());
}

return true; /* might need padding (na_string can be fancy Unicode) */
return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding (na_string can be fancy Unicode) */
}
}


bool preformatDatum_s(std::string& preformatted_datum, const String8& datum)
StriSprintfFormatStatus preformatDatum_s(std::string& preformatted_datum, const String8& datum)
{
STRI_ASSERT(!pad_zero);
STRI_ASSERT(!sign_plus);
Expand All @@ -684,7 +726,7 @@ class StriSprintfFormatSpec
STRI_ASSERT(precision == NA_INTEGER || precision >= 0);

if (!datum.isNA()) {
R_len_t datum_size = datum.length();
R_len_t datum_size = datum.length(); // this is byte count
if (precision != NA_INTEGER) {
if (use_length) {
// ha! output no more than <precision> code points
Expand All @@ -698,10 +740,13 @@ class StriSprintfFormatSpec
preformatted_datum.append(datum.c_str(), datum_size);
}
else { // isNA
if (na_string.isNA())
return STRI_SPRINTF_FORMAT_STATUS_IS_NA;

preformatted_datum.append(na_string.c_str());
}

return true; /* might need padding */
return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding */
}
};

Expand Down Expand Up @@ -753,7 +798,9 @@ SEXP stri__sprintf_1(
// debug: Rprintf("*** spec=%s\n", spec.toString().c_str());
// debug: buf.append(spec.toString());

std::string formatted_datum = spec.formatDatum();
std::string formatted_datum;
if (spec.formatDatum(formatted_datum) == STRI_SPRINTF_FORMAT_STATUS_IS_NA)
return NA_STRING;

buf.append(formatted_datum);
}
Expand Down

0 comments on commit 216ea1a

Please sign in to comment.