Skip to content

Commit

Permalink
Add str_getcsv builtin support (#1096)
Browse files Browse the repository at this point in the history
Signed-off-by: Petr Shumilov <[email protected]>
  • Loading branch information
PetrShumilov authored Sep 6, 2024
1 parent b30b495 commit 6198e0d
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 45 deletions.
2 changes: 2 additions & 0 deletions builtin-functions/kphp-full/_functions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,8 @@ function rtrim ($s ::: string, $what ::: string = " \n\r\t\v\0") ::: string;
function xor_strings ($s ::: string, $t ::: string) ::: string;
function similar_text ($first ::: string, $second ::: string, float &$percent = TODO) ::: int;

function str_getcsv($str ::: string, string $delimiter ::: string = ",", string $enclosure ::: string = "\"", string $escape ::: string = "\\") ::: mixed[] | false;

function extension_loaded(string $extension): bool;

function ctype_alnum(mixed $text): bool;
Expand Down
94 changes: 49 additions & 45 deletions runtime/streams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
#include "runtime/allocator.h"
#include "runtime/critical_section.h"

constexpr int PHP_CSV_NO_ESCAPE = EOF;

static string::size_type max_wrapper_name_size = 0;

static array<const stream_functions *> wrappers;
Expand Down Expand Up @@ -505,43 +503,15 @@ static const char *fgetcsv_lookup_trailing_spaces(const char *ptr, size_t len) {
return ptr;
}


Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string delimiter, string enclosure, string escape) {
if (delimiter.empty()) {
php_warning("delimiter must be a character");
return false;
} else if (delimiter.size() > 1) {
php_warning("delimiter must be a single character");
}
if (enclosure.empty()) {
php_warning("enclosure must be a character");
return false;
} else if (enclosure.size() > 1) {
php_warning("enclosure must be a single character");
}
int escape_char = PHP_CSV_NO_ESCAPE;
if (!escape.empty()) {
escape_char = static_cast<int>(escape[0]);
} else if (escape.size() > 1) {
php_warning("escape_char must be a single character");
}
char delimiter_char = delimiter[0];
char enclosure_char = enclosure[0];
if (length < 0) {
php_warning("Length parameter may not be negative");
return false;
} else if (length == 0) {
length = -1;
}
Optional<string> buf_optional = length < 0 ? f$fgets(stream) : f$fgets(stream, length + 1);
if (!buf_optional.has_value()) {
return false;
}
string buffer = buf_optional.val();
// Common csv-parsing functionality for
// * fgetcsv
// * str_getcsv
// The function is similar to `php_fgetcsv` function from https://github.com/php/php-src/blob/master/ext/standard/file.c
Optional<array<mixed>> getcsv(const Stream &stream, string buffer, char delimiter, char enclosure, char escape) {
array<mixed> answer;
int current_id = 0;
string_buffer tmp_buffer;
// this part is imported from https://github.com/php/php-src/blob/master/ext/standard/file.c, function php_fgetcsv
// Following part is imported from `php_fgetcsv`
char const *buf = buffer.c_str();
char const *bptr = buf;
size_t buf_len = buffer.size();
Expand All @@ -557,10 +527,10 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de
inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mblen(bptr, limit - bptr)) : 0);
if (inc_len == 1) {
char const *tmp = bptr;
while ((*tmp != delimiter_char) && isspace((int)*(unsigned char *)tmp)) {
while ((*tmp != delimiter) && isspace((int)*(unsigned char *)tmp)) {
tmp++;
}
if (*tmp == enclosure_char) {
if (*tmp == enclosure) {
bptr = tmp;
}
}
Expand All @@ -571,7 +541,7 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de
}
first_field = false;
/* 2. Read field, leaving bptr pointing at start of next field */
if (inc_len != 0 && *bptr == enclosure_char) {
if (inc_len != 0 && *bptr == enclosure) {
int state = 0;

bptr++; /* move on to first character in field */
Expand Down Expand Up @@ -641,7 +611,7 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de
state = 0;
break;
case 2: /* embedded enclosure ? let's check it */
if (*bptr != enclosure_char) {
if (*bptr != enclosure) {
/* real enclosure */
tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin - 1));
hunk_begin = bptr;
Expand All @@ -653,9 +623,9 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de
state = 0;
break;
default:
if (*bptr == enclosure_char) {
if (*bptr == enclosure) {
state = 2;
} else if (escape_char != PHP_CSV_NO_ESCAPE && *bptr == escape_char) {
} else if (escape != PHP_CSV_NO_ESCAPE && *bptr == escape) {
state = 1;
}
bptr++;
Expand Down Expand Up @@ -697,7 +667,7 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de
inc_len = 1;
/* fallthrough */
case 1:
if (*bptr == delimiter_char) {
if (*bptr == delimiter) {
goto quit_loop_3;
}
break;
Expand Down Expand Up @@ -725,7 +695,7 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de
inc_len = 1;
/* fallthrough */
case 1:
if (*bptr == delimiter_char) {
if (*bptr == delimiter) {
goto quit_loop_4;
}
break;
Expand All @@ -740,7 +710,7 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de

char const *comp_end = (char *)fgetcsv_lookup_trailing_spaces(tmp_buffer.c_str(), tmp_buffer.size());
tmp_buffer.set_pos(comp_end - tmp_buffer.c_str());
if (*bptr == delimiter_char) {
if (*bptr == delimiter) {
bptr++;
}
}
Expand All @@ -753,6 +723,40 @@ Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string de
return answer;
}

// Reads one line from `stream` and parses it as CSV using the shared `getcsv` routine.
//
// Parameters:
//   stream    - stream the line is read from (forwarded to f$fgets and getcsv)
//   length    - maximum line length; 0 means "no limit", a negative value is rejected
//   delimiter - field delimiter; must not be empty, only its first character is used
//   enclosure - field enclosure; must not be empty, only its first character is used
//   escape    - escape character; an empty string disables escaping, otherwise only
//               its first character is used
//
// Returns the parsed fields, or false if delimiter/enclosure is empty, length is negative
// or no line could be read from the stream.
Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length, string delimiter, string enclosure, string escape) {
  if (delimiter.empty()) {
    php_warning("delimiter must be a character");
    return false;
  } else if (delimiter.size() > 1) {
    php_warning("delimiter must be a single character");
  }
  if (enclosure.empty()) {
    php_warning("enclosure must be a character");
    return false;
  } else if (enclosure.size() > 1) {
    php_warning("enclosure must be a single character");
  }
  // An empty escape string keeps the "no escape" sentinel.
  int escape_char = PHP_CSV_NO_ESCAPE;
  if (!escape.empty()) {
    // The length check has to live inside the non-empty branch: as an `else if` of
    // `!escape.empty()` it could never be true and the warning was never emitted.
    if (escape.size() > 1) {
      php_warning("escape_char must be a single character");
    }
    escape_char = static_cast<int>(escape[0]);
  }
  char delimiter_char = delimiter[0];
  char enclosure_char = enclosure[0];
  if (length < 0) {
    php_warning("Length parameter may not be negative");
    return false;
  } else if (length == 0) {
    // 0 means "unlimited": switch to the f$fgets overload without a length
    length = -1;
  }
  Optional<string> buf_optional = length < 0 ? f$fgets(stream) : f$fgets(stream, length + 1);
  if (!buf_optional.has_value()) {
    return false;
  }
  // NOTE(review): getcsv takes `char escape`, so the int sentinel is narrowed at this call;
  // the `escape != PHP_CSV_NO_ESCAPE` check inside getcsv then presumably relies on plain
  // `char` being signed - confirm for all supported targets.
  return getcsv(stream, buf_optional.val(), delimiter_char, enclosure_char, escape_char);
}

Optional<string> f$file_get_contents(const string &stream) {
STREAM_FUNCTION_BODY(file_get_contents, false)(url);
}
Expand Down
3 changes: 3 additions & 0 deletions runtime/streams.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ constexpr int64_t STREAM_SET_READ_BUFFER_OPTION = 2;

constexpr int64_t FILE_APPEND = 1;

// Sentinel escape value meaning "no escape character": the CSV helpers compare the escape
// argument against it to decide whether escaping is enabled.
constexpr int PHP_CSV_NO_ESCAPE = EOF;

struct stream_functions {
string name;
Expand Down Expand Up @@ -89,6 +90,8 @@ Optional<int64_t> f$vfprintf(const Stream &stream, const string &format, const a
Optional<int64_t> f$fputcsv(const Stream &stream, const array<mixed> &fields, string delimiter = string(",", 1),
string enclosure = string("\"", 1), string escape_char = string("\\", 1));

// Common CSV-parsing routine behind f$fgetcsv and f$str_getcsv: splits `buffer` into fields
// using the given delimiter and enclosure characters. Passing PHP_CSV_NO_ESCAPE as `escape`
// disables escape handling.
// NOTE(review): f$str_getcsv passes a null `stream`; presumably the stream is only needed
// when parsing has to continue past `buffer` - confirm.
Optional<array<mixed>> getcsv(const Stream &stream, string buffer, char delimiter, char enclosure, char escape);

Optional<array<mixed>> f$fgetcsv(const Stream &stream, int64_t length = 0, string delimiter = string(",", 1),
string enclosure = string("\"", 1), string escape_char = string("\\", 1));

Expand Down
40 changes: 40 additions & 0 deletions runtime/string_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
#include "runtime/context/runtime-context.h"
#include "runtime/interface.h"

// For "f$str_getcsv" support
#include "runtime/streams.h"

const string COLON(",", 1);
const string CP1251("cp1251");
const string DOT(".", 1);
Expand Down Expand Up @@ -2950,3 +2953,40 @@ string str_concat(str_concat_arg s1, str_concat_arg s2, str_concat_arg s3, str_c
auto new_size = s1.size + s2.size + s3.size + s4.size + s5.size;
return string(new_size, true).append_unsafe(s1.as_tmp_string()).append_unsafe(s2.as_tmp_string()).append_unsafe(s3.as_tmp_string()).append_unsafe(s4.as_tmp_string()).append_unsafe(s5.as_tmp_string()).finish_append();
}

// PHP's str_getcsv, implemented on top of the shared `getcsv` routine from `streams`.
//
// The PHP manual allows only one single-byte character for each of delimiter, enclosure
// and escape. Longer arguments merely produce a warning here and their first character
// is used (per the original author's note, PHP itself returns false since 8.3.11).
// An empty delimiter/enclosure falls back to ',' / '"'; an empty escape disables escaping.
Optional<array<mixed>> f$str_getcsv(const string &str, const string &delimiter, const string &enclosure, const string &escape) {
  if (delimiter.size() > 1) {
    php_warning("Delimiter must be a single character");
  }
  const char separator = delimiter.empty() ? ',' : delimiter[0];

  if (enclosure.size() > 1) {
    php_warning("Enclosure must be a single character");
  }
  const char quote = enclosure.empty() ? '"' : enclosure[0];

  if (escape.size() > 1) {
    php_warning("Escape must be a single character");
  }
  // PHP_CSV_NO_ESCAPE is an int constant; narrow it to char just like a plain initialization would
  const char escape_symbol = escape.empty() ? static_cast<char>(PHP_CSV_NO_ESCAPE) : escape[0];

  return getcsv(mixed() /* null */, str, separator, quote, escape_symbol);
}
3 changes: 3 additions & 0 deletions runtime/string_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ string f$vsprintf(const string &format, const array<mixed> &args);

string f$wordwrap(const string &str, int64_t width = 75, const string &brk = NEW_LINE, bool cut = false);

// Parses the string `s` as CSV via `getcsv` and returns the resulting fields.
// An empty `delimiter` or `enclosure` falls back to ',' and '"' respectively; an empty
// `escape` disables escaping. An argument longer than one character produces a warning
// and only its first character is used.
Optional<array<mixed>> f$str_getcsv(const string &s, const string &delimiter = string(1, ','),
const string &enclosure = string(1, '\"'), const string &escape = string(1, '\\'));

/*
*
* IMPLEMENTATION
Expand Down
27 changes: 27 additions & 0 deletions tests/phpt/string_functions/011_str_getcsv.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
@ok
<?php

$s1 = <<<_STR
"a","b","c"
_STR;

$s2 = <<<_STR
*a*,*b*,*\*c*
_STR;


// In PHP, empty delimiter and enclosure arguments lead to the same behavior as omitted arguments
var_dump(str_getcsv($s1));
var_dump(str_getcsv($s1, ""));

var_dump(str_getcsv($s1, ","));
var_dump(str_getcsv($s1, ",", ""));

// But an omitted escape argument has the same semantics as a single backslash ("\"),
// while an empty escape string does not:
// 1 <=> 2
// not 1 <=> 3
var_dump(str_getcsv($s2, ",", "*")); // 1
var_dump(str_getcsv($s2, ",", "*", "\\")); // 2
var_dump(str_getcsv($s2, ",", "*", "")); // 3


3 changes: 3 additions & 0 deletions tests/zend-test-list
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,9 @@ ext/standard/tests/strings/vsprintf_basic8.phpt
ext/standard/tests/strings/vsprintf_basic9.phpt
ext/standard/tests/strings/wordwrap_basic.phpt
ext/standard/tests/strings/wordwrap_variation5.phpt
ext/standard/tests/strings/str_getcsv_001.phpt
ext/standard/tests/strings/str_getcsv_002.phpt
ext/standard/tests/strings/bug55674.phpt
ext/standard/tests/url/base64_decode_basic_001.phpt
ext/standard/tests/url/base64_decode_basic_002.phpt
ext/standard/tests/url/base64_encode_basic_001.phpt
Expand Down

0 comments on commit 6198e0d

Please sign in to comment.