From b44cc597cc5491f32da99de87fa8370f773b6fe4 Mon Sep 17 00:00:00 2001 From: gagolews Date: Tue, 18 May 2021 18:04:20 +1000 Subject: [PATCH] #420 stri_sprintf skeleton --- NAMESPACE | 3 ++ NEWS | 20 +++++++-- R/pad.R | 2 +- R/sprintf.R | 94 +++++++++++++++++++++++++++++++++++++-- man/operator_dollar.Rd | 11 ++--- man/stri_pad.Rd | 2 +- man/stri_sprintf.Rd | 79 ++++++++++++++++++++++++++++++++ src/stri_container_base.h | 4 ++ src/stri_container_utf8.h | 1 + src/stri_cpp.txt | 1 + src/stri_exports.h | 11 ++++- src/stri_length.cpp | 1 + src/stri_sprintf.cpp | 71 +++++++++++++++++++++++++++++ src/stri_stringi.cpp | 1 + 14 files changed, 285 insertions(+), 16 deletions(-) create mode 100644 man/stri_sprintf.Rd create mode 100644 src/stri_sprintf.cpp diff --git a/NAMESPACE b/NAMESPACE index fe89bf6b4..6d0d5601c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -169,6 +169,7 @@ export(stri_pad_left) export(stri_pad_right) export(stri_paste) export(stri_paste_list) +export(stri_printf) export(stri_rand_lipsum) export(stri_rand_shuffle) export(stri_rand_strings) @@ -206,12 +207,14 @@ export(stri_split_fixed) export(stri_split_lines) export(stri_split_lines1) export(stri_split_regex) +export(stri_sprintf) export(stri_startswith) export(stri_startswith_charclass) export(stri_startswith_coll) export(stri_startswith_fixed) export(stri_stats_general) export(stri_stats_latex) +export(stri_string_format) export(stri_sub) export(stri_sub_all) export(stri_sub_all_replace) diff --git a/NEWS b/NEWS index 71b8eccb3..c2881c31f 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,20 @@ # What Is New in *stringi* + +## 1.6.3-devel (2021-xx-yy) + +* TODO ... [NEW FEATURE] #420: `stri_sprintf` (alias: `stri_string_format`) + is a Unicode-aware replacement for the base `sprintf`: + it adds a customised handling of `NA`s (on demand) and + computing field size based on code point width. + Moreover, `stri_printf` can be used to display formatted strings + conveniently. + +* TODO ... 
[NEW FEATURE] #434: `stri_datetime_format` and `stri_datetime_parse` + are now also vectorised with respect to the `format` argument. + + ## 1.6.2 (2021-05-14) * [BACKWARD INCOMPATIBILITY] In `stri_enc_list()`, @@ -12,13 +26,13 @@ * [NEW FEATURE] #428: In `stri_flatten`, `na_empty=NA` now omits missing values. * [BUILD TIME] #431: Pre-4.9.0 GCC has `::max_align_t`, - but not `std::max_align_t`, added a (possible) workaround, see the INSTALL + but not `std::max_align_t`, added a (possible) workaround, see the `INSTALL` file. * [BUGFIX] #429: `stri_width()` misclassified the width of certain code points (including grave accent, Eszett, etc.); - General category Sk (Symbol, modifier) is no longer of width 0, - UCHAR_EAST_ASIAN_WIDTH of U_EA_AMBIGUOUS is no longer of width 2. + General category *Sk* (Symbol, modifier) is no longer of width 0, + `UCHAR_EAST_ASIAN_WIDTH` of `U_EA_AMBIGUOUS` is no longer of width 2. * [BUGFIX] #354: `ALTREP` `CHARSXP`s were not copied, and thus could have been garbage collected in the so-called meanwhile (with thanks to @jimhester). diff --git a/R/pad.R b/R/pad.R index 4b2a5f26d..1dbdfcc15 100644 --- a/R/pad.R +++ b/R/pad.R @@ -67,7 +67,7 @@ #' points be used instead of the total code point width #' (see \code{\link{stri_width}})? #' -#' @return Returns a character vector. +#' @return These functions return a character vector. #' #' @rdname stri_pad #' @examples diff --git a/R/sprintf.R b/R/sprintf.R index b31dff55a..f218f9319 100644 --- a/R/sprintf.R +++ b/R/sprintf.R @@ -31,12 +31,98 @@ ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#' @title +#' Format Strings +#' +#' @description +#' A Unicode-aware replacement for the built-in \code{\link[base]{sprintf}} +#' function. Moreover, \code{stri_printf} displays/writes formatted strings. +#' +#' @details +#' Vectorized over \code{format} and all vectors passed via \code{...}. +#' +#' \code{stri_string_format} is a synonym for \code{stri_sprintf}. 
+#' +#' Note that \code{stri_printf} treats missing values as \code{"NA"} strings +#' by default. +#' +#' Note that Unicode code points may have various widths when +#' printed on the console and that, by default, the function takes that +#' into account. By changing the state of the \code{use_length} +#' argument, this function acts as if each code point was of width 1. +#' +#' @param format character vector of format strings +#' @param ... logical, integer, real, or character vectors (or objects +#' coercible to) +#' @param na_string single string to represent missing values; +#' if \code{NA}, missing values in \code{...} +#' result in the corresponding outputs being missing too; +#' use \code{"NA"} for compatibility with base R +#' @param inf_string single string to represent the (unsigned) infinity +#' @param nan_string single string to represent the not-a-number +#' @param use_length single logical value; should the number of code +#' points be used when applying modifiers such as \code{\%20s} +#' instead of the total code point width (see \code{\link{stri_width}})? +#' @param file see \code{\link[base]{cat}} +#' @param sep see \code{\link[base]{cat}} +#' @param append see \code{\link[base]{cat}} +#' +#' @return +#' \code{stri_printf} is used for its side effect, which is printing +#' of text on the standard output or other connection. Hence, it returns +#' \code{invisible(NULL)}. +#' +#' The other functions return a character vector. +#' +#' @rdname stri_sprintf +#' @examples +#' stri_sprintf("%10s=%.3f", "pi", pi) +#' +#' @export +stri_sprintf <- function( + format, ..., + na_string=NA_character_, + inf_string="Inf", + nan_string="NaN", + use_length=FALSE +) { + # force eval of ... 
here + .Call(C_stri_sprintf, format, list(...), + na_string, inf_string, nan_string, use_length) +} + + +#' @rdname stri_sprintf +#' @export +stri_string_format <- stri_sprintf + + +#' @export +stri_printf <- function( + format, ..., + file="", + sep="\n", + append=FALSE, + na_string="NA", + inf_string="Inf", + nan_string="NaN", + use_length=FALSE +) { + # force eval of ... here + str <- .Call(C_stri_sprintf, format, list(...), + na_string, inf_string, nan_string, use_length) + cat(str, file=file, sep=sep, append=append) +} + +### TODO: update + #' @title #' C-Style Formatting with sprintf as a Binary Operator +#' TODO: call stri_sprintf #' #' @description -#' Provides access to base R's \code{\link{sprintf}} in form of a binary +#' Provides access to base R's \code{\link[base]{sprintf}} in form of a binary #' operator in a way similar to Python's \code{\%} overloaded for strings. #' #' @@ -47,12 +133,12 @@ #' \code{e1 \%s$\% atomic_vector} is equivalent to #' \code{e1 \%s$\% list(atomic_vector)}. #' -#' Note that \code{\link{sprintf}} takes field width in bytes, +#' Note that \code{\link[base]{sprintf}} takes field width in bytes, #' not Unicode code points. See Examples for a workaround. 
#' #' -#' @param e1 format strings, see \code{\link{sprintf}} for syntax -#' @param e2 a list of atomic vectors to be passed to \code{\link{sprintf}} +#' @param e1 format strings, see \code{\link[base]{sprintf}} for syntax +#' @param e2 a list of atomic vectors to be passed to \code{\link[base]{sprintf}} #' or a single atomic vector #' #' @return diff --git a/man/operator_dollar.Rd b/man/operator_dollar.Rd index cb744cb63..caef38853 100644 --- a/man/operator_dollar.Rd +++ b/man/operator_dollar.Rd @@ -5,23 +5,24 @@ \alias{operator_dollar} \alias{oper_dollar} \alias{\%stri$\%} -\title{C-Style Formatting with sprintf as a Binary Operator} +\title{C-Style Formatting with sprintf as a Binary Operator +TODO: call stri_sprintf} \usage{ e1 \%s$\% e2 e1 \%stri$\% e2 } \arguments{ -\item{e1}{format strings, see \code{\link{sprintf}} for syntax} +\item{e1}{format strings, see \code{\link[base]{sprintf}} for syntax} -\item{e2}{a list of atomic vectors to be passed to \code{\link{sprintf}} +\item{e2}{a list of atomic vectors to be passed to \code{\link[base]{sprintf}} or a single atomic vector} } \value{ Returns a character vector } \description{ -Provides access to base R's \code{\link{sprintf}} in form of a binary +Provides access to base R's \code{\link[base]{sprintf}} in form of a binary operator in a way similar to Python's \code{\%} overloaded for strings. } \details{ @@ -30,7 +31,7 @@ Vectorized over \code{e1} and \code{e2}. \code{e1 \%s$\% atomic_vector} is equivalent to \code{e1 \%s$\% list(atomic_vector)}. -Note that \code{\link{sprintf}} takes field width in bytes, +Note that \code{\link[base]{sprintf}} takes field width in bytes, not Unicode code points. See Examples for a workaround. } \examples{ diff --git a/man/stri_pad.Rd b/man/stri_pad.Rd index f0a3d763a..185a27279 100644 --- a/man/stri_pad.Rd +++ b/man/stri_pad.Rd @@ -52,7 +52,7 @@ sides on which padding character is added (\code{left}, \code{right}, or \code{both})} } \value{ -Returns a character vector. 
+These functions return a character vector. } \description{ Add multiple \code{pad} characters at the given \code{side}(s) of each string diff --git a/man/stri_sprintf.Rd b/man/stri_sprintf.Rd new file mode 100644 index 000000000..ffced5f60 --- /dev/null +++ b/man/stri_sprintf.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sprintf.R +\name{stri_sprintf} +\alias{stri_sprintf} +\alias{stri_string_format} +\title{Format Strings} +\usage{ +stri_sprintf( + format, + ..., + na_string = NA_character_, + inf_string = "Inf", + nan_string = "NaN", + use_length = FALSE +) + +stri_string_format( + format, + ..., + na_string = NA_character_, + inf_string = "Inf", + nan_string = "NaN", + use_length = FALSE +) +} +\arguments{ +\item{format}{character vector of format strings} + +\item{...}{logical, integer, real, or character vectors (or objects +coercible to)} + +\item{na_string}{single string to represent the not-a-number} + +\item{inf_string}{single string to represent the (unsigned) infinity} + +\item{use_length}{single logical value; should the number of code +points be used when applying modifiers such as \code{\%20s} +instead of the total code point width (see \code{\link{stri_width}})?} + +\item{file}{see \code{\link[base]{cat}}} + +\item{sep}{see \code{\link[base]{cat}}} + +\item{append}{see \code{\link[base]{cat}}} +} +\value{ +\code{stri_printf} is used for its side effect, which is printing +of text on the standard output or other connection. Hence, it returns +\code{invisible(NULL)}. + +The other function return a character vector. +} +\description{ +A Unicode-aware replacements for the built-in \code{\link[base]{sprintf}} +function. Moreover, \code{stri_printf} displays/writes formatted strings. +} +\details{ +Vectorized over \code{format} and all vectors passed via \code{...}. + +\code{stri_string_format} is a synonym for \code{stri_sprintf}. 
+ +Note that \code{stri_printf} treats missing values as \code{"NA"} strings +by default. + +Note that Unicode code points may have various widths when +printed on the console and that, by default, the function takes that +into account. By changing the state of the \code{use_length} +argument, this function act as if each code point was of width 1. +} +\examples{ +stri_sprintf("\%10s=\%.3f", "pi", pi) + +} +\author{ +\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors +} +\seealso{ +The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/} +} diff --git a/src/stri_container_base.h b/src/stri_container_base.h index 8bc8759fe..747cb3267 100644 --- a/src/stri_container_base.h +++ b/src/stri_container_base.h @@ -33,6 +33,10 @@ #ifndef __stri_container_base_h #define __stri_container_base_h +#include "stri_external.h" +#include "stri_exception.h" + + /** * Base class for all StriContainers diff --git a/src/stri_container_utf8.h b/src/stri_container_utf8.h index c782fea49..e8498d878 100644 --- a/src/stri_container_utf8.h +++ b/src/stri_container_utf8.h @@ -34,6 +34,7 @@ #define __stri_container_utf8_h #include "stri_container_base.h" +#include "stri_string8.h" /** diff --git a/src/stri_cpp.txt b/src/stri_cpp.txt index 17954c189..65bd0d836 100644 --- a/src/stri_cpp.txt +++ b/src/stri_cpp.txt @@ -65,6 +65,7 @@ stri_search_regex_replace.cpp \ stri_search_regex_split.cpp \ stri_search_regex_subset.cpp \ stri_sort.cpp \ +stri_sprintf.cpp \ stri_stats.cpp \ stri_stringi.cpp \ stri_sub.cpp \ diff --git a/src/stri_exports.h b/src/stri_exports.h index 8b443f1ae..167008d6f 100644 --- a/src/stri_exports.h +++ b/src/stri_exports.h @@ -36,7 +36,6 @@ #include #include - // compare.cpp: SEXP stri_cmp(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_le(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); @@ -149,7 +148,15 @@ SEXP stri_enc_isutf32be(SEXP str); // pad.cpp SEXP stri_pad(SEXP str, SEXP width, SEXP 
side=Rf_mkString("left"), - SEXP pad=Rf_mkString(" "), SEXP use_length=Rf_ScalarLogical(FALSE)); + SEXP pad=Rf_mkString(" "), SEXP use_length=Rf_ScalarLogical(FALSE)); + + +// sprintf.cpp +SEXP stri_sprintf(SEXP format, SEXP x, + SEXP na_string=Rf_ScalarString(NA_STRING), + SEXP inf_string=Rf_mkString("Inf"), + SEXP nan_string=Rf_mkString("NaN"), + SEXP use_length=Rf_ScalarLogical(FALSE)); // wrap.cpp SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent=Rf_ScalarInteger(2), diff --git a/src/stri_length.cpp b/src/stri_length.cpp index 44af43272..bcdf97d92 100644 --- a/src/stri_length.cpp +++ b/src/stri_length.cpp @@ -414,6 +414,7 @@ int stri__width_string(const char* str_cur_s, int str_cur_n) return cur_width; } + /** * Determine the width of strings * diff --git a/src/stri_sprintf.cpp b/src/stri_sprintf.cpp new file mode 100644 index 000000000..98b513b87 --- /dev/null +++ b/src/stri_sprintf.cpp @@ -0,0 +1,71 @@ +/* This file is part of the 'stringi' project. + * Copyright (c) 2013-2021, Marek Gagolewski + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "stri_stringi.h" +#include "stri_container_utf8.h" +#include "stri_container_integer.h" +#include "stri_container_logical.h" +#include "stri_container_double.h" +#include "stri_string8buf.h" +#include +#include + + +/** + * Format a string + * + * vectorized over format and each vector in x + * + * @param format character vector + * @param x list of vectors + * @param na_string single string, can be NA + * @param inf_string single string + * @param nan_string single string + * @param use_length single logical value + * @return character vector + * + * @version 1.6.3 (Marek Gagolewski, 2021-05-18) +*/ +SEXP stri_sprintf(SEXP format, SEXP x, SEXP na_string, + SEXP inf_string, SEXP nan_string, SEXP use_length) +{ + bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); + //PROTECT(format = stri_prepare_arg_string(format, "format")); + +// int stri__width_string(const char* str_cur_s, int str_cur_n) + // x + // na_string + // nan_string + // inf_string + + return R_NilValue; +} diff --git a/src/stri_stringi.cpp b/src/stri_stringi.cpp index e9e5bdc95..9b47885ed 100644 --- a/src/stri_stringi.cpp +++ b/src/stri_stringi.cpp @@ -191,6 
+191,7 @@ const R_CallMethodDef cCallMethods[] = { STRI__MK_CALL("C_stri_split_lines", stri_split_lines, 2), STRI__MK_CALL("C_stri_split_lines1", stri_split_lines1, 1), STRI__MK_CALL("C_stri_split_regex", stri_split_regex, 7), + STRI__MK_CALL("C_stri_sprintf", stri_sprintf, 6), STRI__MK_CALL("C_stri_startswith_charclass", stri_startswith_charclass, 4), STRI__MK_CALL("C_stri_startswith_coll", stri_startswith_coll, 5), STRI__MK_CALL("C_stri_startswith_fixed", stri_startswith_fixed, 5),