Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xxx regex cache #1770

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions benchmarks/regcomp-cache/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ match-many() {

declare -a REGEXES=()
for i in $(seq $num_pat); do
REGEXES[i]="$i?($i*)$i+" # last char is modified with ? then * and +
#REGEXES[i]="$i?($i*)$i+" # last char is modified with ? then * and +

# char classes are expensive to compile
REGEXES[i]="$i?($i*)$i+[a-zA-Z_]?" # last char is modified with ? then * and +
done

echo "${REGEXES[@]}"
Expand Down Expand Up @@ -68,8 +71,11 @@ compare() {
# with OSH
{ time $bin $0 match-many "$@"; } >$dir/osh-stdout.txt 2>$dir/osh-time.txt

# OSH without cache
{ time OILS_REGEX_CACHE_SIZE=0 $bin $0 match-many "$@"; } >$dir/osh-nocache-stdout.txt 2>$dir/osh-nocache-time.txt

# should have equal output except for version
diff $dir/*-stdout.txt || true
diff $dir/{bash,osh}-stdout.txt || true

# show timings
head $dir/*-time.txt
Expand Down
120 changes: 97 additions & 23 deletions cpp/libc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,102 @@
#include <fnmatch.h>
#include <glob.h>
#include <locale.h>
#include <regex.h>
#include <stdlib.h> // getenv()
#include <sys/ioctl.h>
#include <unistd.h> // gethostname()
#include <wchar.h>

#include <algorithm>

namespace libc {

// Compiles `pat` with ::regcomp() using `cflags`. On success the entry keeps
// its own malloc'd copy of the pattern text along with hash(pat), which
// RegexCache::TakeEntry() compares against during lookup.
//
// Throws ValueError (carrying the regerror() description) when the pattern
// does not compile. In that case pat_ is still null and nothing has been
// allocated by this constructor.
RegexCache::CacheEntry::CacheEntry(BigStr* pat, int cflags) : pat_(nullptr) {
  const int rc = ::regcomp(&compiled_, pat->data_, cflags);
  if (rc != 0) {
    char reason[50];
    regerror(rc, &compiled_, reason, sizeof(reason));

    // snprintf() truncates, so a long pattern yields a shortened message
    // rather than overflowing the buffer.
    char msg[80];
    snprintf(msg, sizeof(msg), "Invalid regex %s (%s)", pat->data_, reason);

    throw Alloc<ValueError>(StrFromC(msg));
  }

  // len(pat) + 1 bytes: presumably the pattern text plus its NUL terminator
  // (pat->data_ is already treated as a C string by ::regcomp() above).
  const size_t num_bytes = len(pat) + 1;
  pat_ = static_cast<char*>(malloc(num_bytes));
  memcpy(pat_, pat->data_, num_bytes);
  pat_hash_ = hash(pat);
}

// Releases everything the constructor acquired: the compiled regex and the
// malloc'd copy of the pattern text.
RegexCache::CacheEntry::~CacheEntry() {
  // A fully constructed entry always owns a pattern copy.
  DCHECK(pat_ != nullptr);
  regfree(&compiled_);
  free(pat_);
}

// Creates an empty cache that holds up to `capacity` compiled regexes.
//
// If the OILS_REGEX_CACHE_SIZE environment variable is set to a non-negative
// base-10 integer, that value replaces `capacity`. Values that fail to parse,
// and negative values, are ignored: capacity_ is unsigned, so a negative
// number would otherwise wrap around to a huge limit and effectively disable
// eviction.
RegexCache::RegexCache(int capacity) : capacity_(capacity), access_list_() {
  char* env_value = getenv("OILS_REGEX_CACHE_SIZE");
  if (env_value == nullptr) {
    return;
  }

  int parsed;
  if (StringToInt(env_value, strlen(env_value), 10, &parsed) && parsed >= 0) {
    capacity_ = parsed;
  }
}

// Destroys every entry still held by the cache. The list stores owning raw
// pointers, so each one is deleted explicitly.
RegexCache::~RegexCache() {
  for (RegexCache::CacheEntry* entry : access_list_) {
    delete entry;
  }
}

// Returns a compiled regex for `pat`, reusing a cached compilation if one
// exists and compiling (and caching) it otherwise. The entry is moved to the
// most-recently-used position either way.
//
// The returned pointer is owned by the cache. It stays valid only until the
// entry is evicted by a later call or the cache is destroyed, so callers must
// not regfree() it.
//
// Throws ValueError (from the CacheEntry constructor) if `pat` is not a valid
// regex.
//
// NOTE(review): the lookup is keyed on the pattern text only, since
// TakeEntry() is not given `cflags`. A pattern first compiled with one set of
// flags is returned as-is for a later request with different flags. Confirm
// that all callers use consistent flags, or extend the key.
regex_t* RegexCache::regcomp(BigStr* pat, int cflags) {
  RegexCache::CacheEntry* entry = TakeEntry(pat);
  if (entry == nullptr) {
    // Compile before evicting: the constructor throws on an invalid pattern,
    // and a failed compile should not cost us a perfectly good cached entry.
    entry = new RegexCache::CacheEntry(pat, cflags);
    MaybeEvict();
  }

  SetMostRecent(entry);

  return &entry->compiled_;
}

// Finds the cached entry whose pattern text equals `pat` and removes it from
// the access list, handing it to the caller (who is expected to re-insert it,
// e.g. via SetMostRecent()). Returns nullptr when there is no such entry.
//
// The stored hash is compared first as a cheap filter, and strcmp() confirms
// the match.
RegexCache::CacheEntry* RegexCache::TakeEntry(BigStr* pat) {
  // The hash of the probe is the same for every candidate, so compute it once
  // instead of once per entry inside the predicate.
  const auto wanted_hash = hash(pat);

  auto it = std::find_if(access_list_.begin(), access_list_.end(),
                         [pat, wanted_hash](RegexCache::CacheEntry* entry) {
                           return wanted_hash == entry->pat_hash_ &&
                                  strcmp(pat->data_, entry->pat_) == 0;
                         });
  if (it == access_list_.end()) {
    return nullptr;
  }

  RegexCache::CacheEntry* found = *it;
  access_list_.erase(it);
  return found;
}

void RegexCache::MaybeEvict() {
if (access_list_.size() < capacity_) {
return;
}

// Evict the least recently used entry.
if (access_list_.size()) {
delete *access_list_.begin();
access_list_.erase(access_list_.begin());
}
}

// Records `entry` as the most recently used one by placing it at the back of
// the access list (MaybeEvict() removes from the front).
void RegexCache::SetMostRecent(RegexCache::CacheEntry* entry) {
  access_list_.insert(access_list_.end(), entry);
}

// Process-wide cache of compiled regexes, used by regex_search() and
// regex_first_group_match(). Starts at RegexCache::kDefaultSize entries; the
// RegexCache constructor may override that from OILS_REGEX_CACHE_SIZE.
RegexCache gRegexCache(RegexCache::kDefaultSize);

BigStr* gethostname() {
// Note: Fixed issue #1656 - OS X and FreeBSD don't have HOST_NAME_MAX
// https://reviews.freebsd.org/D30062
Expand Down Expand Up @@ -108,28 +197,18 @@ List<BigStr*>* glob(BigStr* pat) {
List<int>* regex_search(BigStr* pattern, int cflags, BigStr* str, int eflags,
int pos) {
cflags |= REG_EXTENDED;
regex_t pat;
int status = regcomp(&pat, pattern->data_, cflags);
if (status != 0) {
char error_desc[50];
regerror(status, &pat, error_desc, 50);

char error_message[80];
snprintf(error_message, 80, "Invalid regex %s (%s)", pattern->data_,
error_desc);

throw Alloc<ValueError>(StrFromC(error_message));
}
regex_t* compiled = gRegexCache.regcomp(pattern, cflags);
DCHECK(compiled != nullptr);

int num_groups = pat.re_nsub + 1; // number of captures
int num_groups = compiled->re_nsub + 1; // number of captures

List<int>* indices = NewList<int>();
indices->reserve(num_groups * 2);

const char* s = str->data_;
regmatch_t* pmatch =
static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * num_groups));
bool match = regexec(&pat, s + pos, num_groups, pmatch, eflags) == 0;
bool match = regexec(compiled, s + pos, num_groups, pmatch, eflags) == 0;
if (match) {
int i;
for (i = 0; i < num_groups; i++) {
Expand All @@ -148,7 +227,6 @@ List<int>* regex_search(BigStr* pattern, int cflags, BigStr* str, int eflags,
}

free(pmatch);
regfree(&pat);

if (!match) {
return nullptr;
Expand All @@ -167,20 +245,16 @@ const int NMATCH = 2;
// Odd: This a Tuple2* not Tuple2 because it's Optional[Tuple2]!
Tuple2<int, int>* regex_first_group_match(BigStr* pattern, BigStr* str,
int pos) {
regex_t pat;
regmatch_t m[NMATCH];

// Could have been checked by regex_parse for [[ =~ ]], but not for glob
// patterns like ${foo/x*/y}.

if (regcomp(&pat, pattern->data_, REG_EXTENDED) != 0) {
throw Alloc<RuntimeError>(
StrFromC("Invalid regex syntax (func_regex_first_group_match)"));
}
regex_t* compiled = gRegexCache.regcomp(pattern, REG_EXTENDED);
DCHECK(compiled != nullptr);

// Match at offset 'pos'
int result = regexec(&pat, str->data_ + pos, NMATCH, m, 0 /*flags*/);
regfree(&pat);
int result = regexec(compiled, str->data_ + pos, NMATCH, m, 0 /*flags*/);

if (result != 0) {
return nullptr;
Expand Down
35 changes: 35 additions & 0 deletions cpp/libc.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
#ifndef LIBC_H
#define LIBC_H

#include <regex.h>
#include <stdlib.h>

#include <vector>

#include "mycpp/runtime.h"

namespace libc {
Expand Down Expand Up @@ -33,6 +36,38 @@ List<int>* regex_search(BigStr* pattern, int cflags, BigStr* str, int eflags,
int wcswidth(BigStr* str);
int get_terminal_width();

// A bounded cache of compiled POSIX regexes, evicted in least-recently-used
// order.
//
// Entries are kept in access order in a vector: the front is the least
// recently used entry and the back is the most recently used one.
//
// The cache owns its entries through raw pointers, so it is non-copyable.
class RegexCache {
 public:
  // Capacity used for the global cache unless overridden at construction.
  static const int kDefaultSize = 100;

  // One compiled pattern. Owns both the compiled regex and a private copy of
  // the pattern text, hence non-copyable.
  struct CacheEntry {
    CacheEntry() = delete;
    CacheEntry(const CacheEntry&) = delete;
    CacheEntry& operator=(const CacheEntry&) = delete;

    // Compiles `pat` with `cflags`; throws ValueError if it is invalid.
    CacheEntry(BigStr* pat, int cflags);
    ~CacheEntry();

    char* pat_;         // malloc'd copy of the pattern text
    int pat_hash_;      // hash(pat), compared before the string compare
    regex_t compiled_;  // result of ::regcomp(); freed in the destructor
  };

  // `capacity` is the maximum number of entries to keep. The constructor may
  // replace it with the value of the OILS_REGEX_CACHE_SIZE env variable.
  explicit RegexCache(int capacity);
  ~RegexCache();

  // Copying would make two caches delete the same entries.
  RegexCache(const RegexCache&) = delete;
  RegexCache& operator=(const RegexCache&) = delete;

  // Returns a compiled regex for `pat`, compiling it only on a cache miss.
  // The result is owned by the cache and is valid only until the entry is
  // evicted; callers must not regfree() it.
  //
  // NOTE(review): entries are looked up by pattern text alone, so `cflags`
  // only takes effect on a miss. Confirm callers never mix flags for the same
  // pattern.
  regex_t* regcomp(BigStr* pat, int cflags);

 private:
  // Removes and returns the entry matching `pat`, or nullptr if absent.
  CacheEntry* TakeEntry(BigStr* pat);
  // Drops the least recently used entry if the cache is at capacity.
  void MaybeEvict();
  // Appends `entry` as the most recently used one.
  void SetMostRecent(CacheEntry* entry);

  size_t capacity_;
  std::vector<CacheEntry*> access_list_;  // front = LRU, back = MRU
};

extern RegexCache gRegexCache;

} // namespace libc

#endif // LIBC_H
9 changes: 4 additions & 5 deletions cpp/libc_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,13 @@ TEST for_test_coverage() {
}

void FindAll(const char* p, const char* s) {
regex_t pat;
regex_t* pat;

int cflags = REG_EXTENDED;
if (regcomp(&pat, p, cflags) != 0) {
if ((pat = libc::gRegexCache.regcomp(StrFromC(p), cflags)) == nullptr) {
FAIL();
}
int outlen = pat.re_nsub + 1; // number of captures
int outlen = pat->re_nsub + 1; // number of captures

// TODO: Could statically allocate 99, and assert that re_nsub is less than
// 99. Would speed up loops.
Expand All @@ -164,7 +164,7 @@ void FindAll(const char* p, const char* s) {
while (true) {
// Necessary so ^ doesn't match in the middle!
int eflags = cur_pos == 0 ? 0 : REG_NOTBOL;
bool match = regexec(&pat, s + cur_pos, outlen, pmatch, eflags) == 0;
bool match = regexec(pat, s + cur_pos, outlen, pmatch, eflags) == 0;

if (!match) {
break;
Expand All @@ -186,7 +186,6 @@ void FindAll(const char* p, const char* s) {
}

free(pmatch);
regfree(&pat);
}

// adjacent matches
Expand Down
6 changes: 6 additions & 0 deletions doc/ref/chap-special-var.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,12 @@ Result of regex evaluation `[[ $x =~ $pat ]]`.
Exit code of each element in a pipeline.


## Tuning

### OILS_REGEX_CACHE_SIZE

The maximum number of entries to keep in the regular expression compilation cache.

## Call Stack

## Tracing
Expand Down
1 change: 1 addition & 0 deletions doc/ref/toc-ysh.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ X [External Lang] BEGIN END when (awk)
[Oils VM] OILS_VERSION
OILS_GC_THRESHOLD OILS_GC_ON_EXIT
OILS_GC_STATS OILS_GC_STATS_FD
OILS_REGEX_CACHE_SIZE
X [Wok] _filename _line
X [Builtin Sub] _buffer
```
Expand Down
Loading