Skip to content

Commit

Permalink
Enhancement to allow the user to change the RE syntax by setting PROC…
Browse files Browse the repository at this point in the history
…INFO["re_syntax"]

Default changed from RE_SYNTAX_GNU_AWK to RE_SYNTAX_POSIX_AWK to align more closely with Gawk
  • Loading branch information
James Parkinson committed Jul 18, 2021
1 parent dde3f5a commit 35bf648
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 66 deletions.
28 changes: 23 additions & 5 deletions lib/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ _awka_init_procinfo( a_VAR *procinfo )
{
a_VAR *ret, *tmp = NULL;
char *tmpstr = NULL;
int slen = 0;

malloc( &tmpstr, 70 );
awka_varinit(tmp);
Expand All @@ -227,7 +228,9 @@ _awka_init_procinfo( a_VAR *procinfo )
/* ppid */
/* uid */

/* identifiers - a subarray of identifiers*/
/* identifiers - NOT a subarray of identifiers like Gawk, but
* the key is "identifiers,<name>"
*/
/* builtins */
for (int i=0; i<A_BI_VARARG_SIZE; i++ ) {
sprintf(tmpstr, "identifiers,%s", _a_bi_vararg[i].name);
Expand Down Expand Up @@ -269,24 +272,39 @@ _awka_init_procinfo( a_VAR *procinfo )
ret->allc = malloc( &ret->ptr, 25 );
ret->slen = 24;
strcpy(ret->ptr, "%a %b %d %H:%M:%S %Z %Y");
ret->ptr[24] = '\0';

/* version */
awka_strcpy(tmp, "version");
ret = awka_arraysearch1( procinfo, tmp, a_ARR_CREATE, 0 );
awka_strcpy(ret, "version");
ret->type = a_VARSTR;
ret->allc = malloc( &ret->ptr, (int) strlen(patch_str) + 1 );
ret->slen = (int) strlen(patch_str);
slen = strlen(patch_str);
ret->allc = malloc( &ret->ptr, slen + 1 );
ret->slen = slen;
strcpy(ret->ptr, patch_str);
ret->ptr[slen] = '\0';

/* awkfile (unique to awka) */
awka_strcpy(tmp, "awkfile");
ret = awka_arraysearch1( procinfo, tmp, a_ARR_CREATE, 0 );
awka_strcpy(ret, "awkfile");
ret->type = a_VARSTR;
ret->allc = malloc( &ret->ptr, (int) strlen(awk_str) + 1 );
ret->slen = (int) strlen(awk_str);
slen = strlen(awk_str);
ret->allc = malloc( &ret->ptr, slen + 1 );
ret->slen = slen;
strcpy(ret->ptr, awk_str);
ret->ptr[slen] = '\0';

/* resyntax (unique to awka) */
awka_strcpy(tmp, "re_syntax");
ret = awka_arraysearch1( procinfo, tmp, a_ARR_CREATE, 0 );
awka_strcpy(ret, "re_syntax");
ret->type = a_VARSTR;
ret->allc = malloc( &ret->ptr, 25 + 1 );
ret->slen = 17;
strcpy(ret->ptr, "RE_SYNTAX_GNU_AWK");
ret->ptr[17] = '\0';

free(tmpstr);
}
Expand Down
81 changes: 78 additions & 3 deletions lib/rexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,81 @@ struct regexp_list_struct {
regexp_list **re_list = NULL;
#define RE_LIST_SIZE 17

/* Syntax bit set passed to awka_regcomp() when a regular expression is
 * compiled; starts out as the POSIX awk syntax. */
long re_syntax = RE_SYNTAX_POSIX_AWK;

/* Cached hash values of the RE_SYNTAX_* names, filled in by
 * _init_hashes().  All of them start at zero; a zero HS_AWK means the
 * hashes have not been computed yet. */
int HS_AWK = 0;
int HS_GNU_AWK;
int HS_EMACS;
int HS_POSIX_AWK;
int HS_POSIX_EGREP;
int HS_POSIX_BASIC;
int HS_POSIX_MIN_BASIC;
int HS_POSIX_EXTENDED;
int HS_POSIX_MIN_EXT;
int HS_GREP;
int HS_EGREP;
int HS_ED;
int HS_SED;

/*
 * _init_hashes
 * Computes the hash of every recognised RE_SYNTAX_* name and stores it
 * in the matching HS_* global.  HS_AWK doubles as the "already done"
 * marker: the work is skipped whenever it is non-zero.
 */
void
_init_hashes()
{
  /* One row per syntax name: where to store the hash, the name itself,
   * and the name's length in characters. */
  static const struct {
    int  *slot;
    char *name;
    int   len;
  } names[] = {
    { &HS_AWK,             "RE_SYNTAX_AWK",                    13 },
    { &HS_GNU_AWK,         "RE_SYNTAX_GNU_AWK",                17 },
    { &HS_EMACS,           "RE_SYNTAX_EMACS",                  15 },
    { &HS_POSIX_AWK,       "RE_SYNTAX_POSIX_AWK",              19 },
    { &HS_POSIX_EGREP,     "RE_SYNTAX_POSIX_EGREP",            21 },
    { &HS_POSIX_BASIC,     "RE_SYNTAX_POSIX_BASIC",            21 },
    { &HS_POSIX_MIN_BASIC, "RE_SYNTAX_POSIX_MINIMAL_BASIC",    29 },
    { &HS_POSIX_EXTENDED,  "RE_SYNTAX_POSIX_EXTENDED",         24 },
    { &HS_POSIX_MIN_EXT,   "RE_SYNTAX_POSIX_MINIMAL_EXTENDED", 32 },
    { &HS_GREP,            "RE_SYNTAX_GREP",                   14 },
    { &HS_EGREP,           "RE_SYNTAX_EGREP",                  15 },
    { &HS_ED,              "RE_SYNTAX_ED",                     12 },
    { &HS_SED,             "RE_SYNTAX_SED",                    13 }
  };
  unsigned int k;

  /* Already populated by an earlier call. */
  if (HS_AWK != 0)
    return;

  for (k = 0; k < sizeof(names) / sizeof(names[0]); k++)
    *names[k].slot = _awka_hashstr(names[k].name, names[k].len);
}

/*
 * _awka_set_re_syntax
 * Selects the regular-expression syntax by storing the RE_SYNTAX_* bit
 * set that corresponds to the given name in the global re_syntax.
 *
 * syn - syntax name such as "RE_SYNTAX_GNU_AWK".  The name must match
 *       exactly (case-sensitive).  A NULL pointer or an unrecognised
 *       name selects the default, RE_SYNTAX_POSIX_AWK.
 *       "RE_SYNTAX_ED" and "RE_SYNTAX_SED" are accepted as aliases for
 *       RE_SYNTAX_POSIX_BASIC.
 *
 * The syntax is handed to the regex compiler when an expression is
 * compiled, so it is not re-applied on every call to match/split/...;
 * set it before the regular expressions it should affect are compiled.
 * If this function is called more than once, the last call determines
 * the value stored in re_syntax.
 */
void
_awka_set_re_syntax(char *syn)
{
  /* Names are compared as strings rather than by hash value so that an
   * unrelated string whose hash happens to collide with a known name
   * can never select a syntax by accident. */
  static const struct {
    const char *name;
    long        syntax;
  } known[] = {
    { "RE_SYNTAX_AWK",                    RE_SYNTAX_AWK },
    { "RE_SYNTAX_GNU_AWK",                RE_SYNTAX_GNU_AWK },
    { "RE_SYNTAX_EMACS",                  RE_SYNTAX_EMACS },
    { "RE_SYNTAX_GREP",                   RE_SYNTAX_GREP },
    { "RE_SYNTAX_EGREP",                  RE_SYNTAX_EGREP },
    { "RE_SYNTAX_ED",                     RE_SYNTAX_POSIX_BASIC },
    { "RE_SYNTAX_SED",                    RE_SYNTAX_POSIX_BASIC },
    { "RE_SYNTAX_POSIX_BASIC",            RE_SYNTAX_POSIX_BASIC },
    { "RE_SYNTAX_POSIX_MINIMAL_BASIC",    RE_SYNTAX_POSIX_MINIMAL_BASIC },
    { "RE_SYNTAX_POSIX_AWK",              RE_SYNTAX_POSIX_AWK },
    { "RE_SYNTAX_POSIX_EGREP",            RE_SYNTAX_POSIX_EGREP },
    { "RE_SYNTAX_POSIX_EXTENDED",         RE_SYNTAX_POSIX_EXTENDED },
    { "RE_SYNTAX_POSIX_MINIMAL_EXTENDED", RE_SYNTAX_POSIX_MINIMAL_EXTENDED }
  };
  long sval = RE_SYNTAX_POSIX_AWK;   /* default for NULL/unknown names */
  size_t i;

  if (syn != NULL)
  {
    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++)
    {
      if (strcmp(syn, known[i].name) == 0)
      {
        sval = known[i].syntax;
        break;
      }
    }
  }

  re_syntax = sval;
}

static char *
_awka_fixescapes(char *str, unsigned int len)
{
Expand Down Expand Up @@ -142,7 +217,7 @@ awka_re_isexactstr(char *str, int len, unsigned can_be_null)
re_list[idx] = list; \
} \
if (!(list->re_fs = awka_re_isexactstr(list->str, len, FALSE))) \
list->re_fs = awka_regcomp(list->str, FALSE); \
list->re_fs = awka_regcomp(list->str, FALSE, re_syntax); \
if (!list->re_fs) \
awka_error("fail to compile regular expression '%s'\n",list->str); \
list->re_fs->dfa = (void *) dfacomp(list->str, strlen(list->str), TRUE); \
Expand All @@ -156,7 +231,7 @@ awka_re_isexactstr(char *str, int len, unsigned can_be_null)
re_list[idx] = list; \
} \
if (!(list->re_nofs = awka_re_isexactstr(list->str, len, FALSE))) \
list->re_nofs = awka_regcomp(list->str, FALSE); \
list->re_nofs = awka_regcomp(list->str, FALSE, re_syntax); \
if (!list->re_nofs) \
awka_error("fail to compile regular expression '%s'\n",list->str); \
list->re_nofs->dfa = (void *) dfacomp(list->str, strlen(list->str), TRUE); \
Expand All @@ -169,7 +244,7 @@ awka_re_isexactstr(char *str, int len, unsigned can_be_null)
re_list[idx] = list; \
} \
if (!(list->re_gsub = awka_re_isexactstr(list->str, len, TRUE))) \
list->re_gsub = awka_regcomp(list->str, TRUE); \
list->re_gsub = awka_regcomp(list->str, TRUE, re_syntax); \
if (!list->re_gsub) \
awka_error("fail to compile regular expression '%s'\n",list->str); \
list->re_gsub->dfa = (void *) dfacomp(list->str, strlen(list->str), TRUE); \
Expand Down
7 changes: 5 additions & 2 deletions lib/var.c
Original file line number Diff line number Diff line change
Expand Up @@ -1101,13 +1101,16 @@ awka_strncpy(a_VAR *v, char *s, int _slen)
tmpv->slen = 2;
strncpy( tmpv->ptr, "FS", 3 );
}

if ( v == a_bivar[a_FIELDWIDTHS] || v == a_bivar[a_SAVEWIDTHS] )
else if ( v == a_bivar[a_FIELDWIDTHS] || v == a_bivar[a_SAVEWIDTHS] )
{
tmpv = awka_arraysearch1( a_bivar[a_PROCINFO], awka_tmp_str2var("FS"), a_ARR_CREATE, 0 );
tmpv->slen = 11; /* sized to 12 bytes originally in init.c */
strncpy( tmpv->ptr, "FIELDWIDTHS", 12 );
}
else if ( v->type == a_VARSTR && strncmp(v->ptr, "RE_SYNTAX_", 10) == 0)
{
_awka_set_re_syntax(s);
}

return v->ptr;
}
Expand Down
2 changes: 2 additions & 0 deletions lib/var.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ extern char fs_or_fw, _awka_setdol0_len;
extern char _rebuild0, _rebuildn, _rebuild0_now;
#endif

extern void _awka_set_re_syntax(char *);

static INLINE a_VAR * awka_NFget();
#define _awka_set_FW(v) \
if ((v) == a_bivar[a_FS]) { \
Expand Down
65 changes: 33 additions & 32 deletions regexp/regex.c
Original file line number Diff line number Diff line change
Expand Up @@ -604,10 +604,10 @@ extract_number_and_incr (destination, source)
static int debug;

# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) if (debug) printf (x)
# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
# define DEBUG_PRINT1(x) if (debug) printf ((x))
# define DEBUG_PRINT2(x1, x2) if (debug) printf ((x1), (x2))
# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf ((x1), (x2), (x3))
# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf ((x1), (x2), (x3), (x4))
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
if (debug) print_partial_compiled_pattern (s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
Expand Down Expand Up @@ -667,7 +667,7 @@ print_partial_compiled_pattern (start, end)
/* Loop over pattern commands. */
while (p < pend)
{
printf ("%d:\t", p - start);
printf ("%d:\t", (int) (p - start));

switch ((re_opcode_t) *p++)
{
Expand All @@ -688,16 +688,16 @@ print_partial_compiled_pattern (start, end)

case start_memory:
mcnt = *p++;
printf ("/start_memory/%d/%d", mcnt, *p++);
printf ("/start_memory/%d/%d", mcnt, (int) (*p++));
break;

case stop_memory:
mcnt = *p++;
printf ("/stop_memory/%d/%d", mcnt, *p++);
printf ("/stop_memory/%d/%d", mcnt, (int) (*p++));
break;

case duplicate:
printf ("/duplicate/%d", *p++);
printf ("/duplicate/%d", (int) (*p++));
break;

case anychar:
Expand Down Expand Up @@ -757,17 +757,17 @@ print_partial_compiled_pattern (start, end)

case on_failure_jump:
extract_number_and_incr (&mcnt, &p);
printf ("/on_failure_jump to %d", p + mcnt - start);
printf ("/on_failure_jump to %d", (int) (p + mcnt - start));
break;

case on_failure_keep_string_jump:
extract_number_and_incr (&mcnt, &p);
printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
printf ("/on_failure_keep_string_jump to %d", (int) (p + mcnt - start));
break;

case dummy_failure_jump:
extract_number_and_incr (&mcnt, &p);
printf ("/dummy_failure_jump to %d", p + mcnt - start);
printf ("/dummy_failure_jump to %d", (int) (p + mcnt - start));
break;

case push_dummy_failure:
Expand All @@ -776,43 +776,43 @@ print_partial_compiled_pattern (start, end)

case maybe_pop_jump:
extract_number_and_incr (&mcnt, &p);
printf ("/maybe_pop_jump to %d", p + mcnt - start);
printf ("/maybe_pop_jump to %d", (int) (p + mcnt - start));
break;

case pop_failure_jump:
extract_number_and_incr (&mcnt, &p);
printf ("/pop_failure_jump to %d", p + mcnt - start);
printf ("/pop_failure_jump to %d", (int) (p + mcnt - start));
break;

case jump_past_alt:
extract_number_and_incr (&mcnt, &p);
printf ("/jump_past_alt to %d", p + mcnt - start);
printf ("/jump_past_alt to %d", (int) (p + mcnt - start));
break;

case jump:
extract_number_and_incr (&mcnt, &p);
printf ("/jump to %d", p + mcnt - start);
printf ("/jump to %d", (int) (p + mcnt - start));
break;

case succeed_n:
extract_number_and_incr (&mcnt, &p);
p1 = p + mcnt;
extract_number_and_incr (&mcnt2, &p);
printf ("/succeed_n to %d, %d times", p1 - start, mcnt2);
printf ("/succeed_n to %d, %d times", (int) (p1 - start), mcnt2);
break;

case jump_n:
extract_number_and_incr (&mcnt, &p);
p1 = p + mcnt;
extract_number_and_incr (&mcnt2, &p);
printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
printf ("/jump_n to %d, %d times", (int) (p1 - start), mcnt2);
break;

case set_number_at:
extract_number_and_incr (&mcnt, &p);
p1 = p + mcnt;
extract_number_and_incr (&mcnt2, &p);
printf ("/set_number_at location %d to %d", p1 - start, mcnt2);
printf ("/set_number_at location %d to %d", (int) (p1 - start), mcnt2);
break;

case wordbound:
Expand Down Expand Up @@ -873,13 +873,13 @@ print_partial_compiled_pattern (start, end)
break;

default:
printf ("?%d", *(p-1));
printf ("?%d", (int) (*(p-1)));
}

putchar ('\n');
}

printf ("%d:\tend of pattern.\n", p - start);
printf ("%d:\tend of pattern.\n", (int) (p - start));
}


Expand All @@ -899,14 +899,14 @@ print_compiled_pattern (bufp)
print_fastmap (bufp->fastmap);
}

printf ("re_nsub: %d\t", bufp->re_nsub);
printf ("regs_alloc: %d\t", bufp->regs_allocated);
printf ("can_be_null: %d\t", bufp->can_be_null);
printf ("newline_anchor: %d\n", bufp->newline_anchor);
printf ("no_sub: %d\t", bufp->no_sub);
printf ("not_bol: %d\t", bufp->not_bol);
printf ("not_eol: %d\t", bufp->not_eol);
printf ("syntax: %lx\n", bufp->syntax);
printf ("re_nsub: %d\t", (int) (bufp->re_nsub));
printf ("regs_alloc: %d\t", (int) (bufp->regs_allocated));
printf ("can_be_null: %d\t", (int) (bufp->can_be_null));
printf ("newline_anchor: %d\n", (int) (bufp->newline_anchor));
printf ("no_sub: %d\t", (int) (bufp->no_sub));
printf ("not_bol: %d\t", (int) (bufp->not_bol));
printf ("not_eol: %d\t", (int) (bufp->not_eol));
printf ("syntax: %lx\n", (int) (bufp->syntax));
/* Perhaps we should print the translate table? */
}

Expand Down Expand Up @@ -959,7 +959,7 @@ printchar (c)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)

#endif /* not DEBUG */


/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
also be assigned to arbitrarily: each pattern buffer stores its own
syntax, so it can be changed between regex compilations. */
Expand Down Expand Up @@ -2998,7 +2998,7 @@ regex_compile (pattern, size, syntax, bufp)

return REG_NOERROR;
} /* regex_compile */


/* Subroutines for `regex_compile'. */

/* Store OP at LOC followed by two-byte integer parameter ARG. */
Expand Down Expand Up @@ -5780,16 +5780,17 @@ _re_gsub_fixslashes(char *pattern)
the return codes and their meanings.) */

awka_regexp *
awka_regcomp (patt, gsub)
awka_regcomp (patt, gsub, syn)
char *patt;
int gsub;
long syn;
{
awka_regexp *preg;
reg_errcode_t ret;
int cflags = REG_EXTENDED;
static char *pattern = NULL;
static int palloc = 0;
reg_syntax_t syntax = RE_SYNTAX_GNU_AWK;
reg_syntax_t syntax = syn;

preg = (awka_regexp *) malloc(sizeof(awka_regexp));
memset(preg, 0, sizeof(awka_regexp));
Expand Down
Loading

0 comments on commit 35bf648

Please sign in to comment.